@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_chroma_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@
@**
@*******************************************************************************
@*
@* @brief
@*  Inter prediction chroma filter
@*
@* @par Description:
@*  Applies filtering to chroma samples as mentioned in
@*  sec 8.4.2.2.2 titled "chroma sample interpolation process"
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u1_dx
@*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] u1_dy
@*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
@                             UWORD8 *pu1_dst,
@                             WORD32 src_strd,
@                             WORD32 dst_strd,
@                             UWORD8 u1_dx,
@                             UWORD8 u1_dy,
@                             WORD32 ht,
@                             WORD32 wd)
@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 =>  u1_dx    (loaded from the stack)
@   r5 =>  u1_dy    (loaded from the stack)
@   r6 =>  height   (loaded from the stack)
@   r7 =>  width    (loaded from the stack)
@
.text
.p2align 2

    .global ih264_inter_pred_chroma_a9q

@ Bilinear chroma interpolation on rows of interleaved U/V bytes.
@
@ Register arguments : r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@ Stack arguments    : u1_dx, u1_dy, ht, wd (in that order)
@
@ For every output byte the routine computes
@     (A*(8-dx)*(8-dy) + B*dx*(8-dy) + C*(8-dx)*dy + D*dx*dy + 32) >> 6
@ where A/B are the source bytes at offsets 0 and +2 in the current row and
@ C/D are the bytes at the same offsets in the next row (offset +2 is the
@ next sample of the same plane because U and V alternate).
@
@ Width dispatch: wd == 2 -> loop_2 (4 bytes/row, two rows per iteration),
@                 wd == 4 -> loop_4 (8 bytes/row),
@                 any other value -> loop_8 (16 bytes/row).
@ Each path loads more source bytes per row than it consumes (8, 16 and 24
@ bytes respectively), so the source buffer must tolerate that over-read.
@ loop_4/loop_8 store ht-1 rows inside the loop and one row after it;
@ loop_2 consumes two rows per pass and exits only when the count hits 0.
ih264_inter_pred_chroma_a9q:

    stmfd       sp!, {r4-r12, r14}      @ save 10 core registers (40 bytes)
    vstmdb      sp!, {d8-d15}           @ save 8 NEON d-registers (64 bytes)
    @ 40 + 64 = 104 bytes were pushed, so the first stack argument is at sp+104
    ldr         r4, [sp, #104]          @ r4 = u1_dx
    ldr         r5, [sp, #108]          @ r5 = u1_dy
    ldr         r6, [sp, #112]          @ r6 = ht
    ldr         r7, [sp, #116]          @ r7 = wd

    rsb         r8, r4, #8              @ r8 = 8 - u1_dx
    rsb         r9, r5, #8              @ r9 = 8 - u1_dy
    mul         r10, r8, r9             @ (8 - dx) * (8 - dy)
    mul         r11, r4, r9             @ dx * (8 - dy)

    vdup.u8     d28, r10                @ d28 = weight of A (row0, offset 0)
    vdup.u8     d29, r11                @ d29 = weight of B (row0, offset +2)

    mul         r10, r8, r5             @ (8 - dx) * dy
    mul         r11, r4, r5             @ dx * dy

    vdup.u8     d30, r10                @ d30 = weight of C (row1, offset 0)
    vdup.u8     d31, r11                @ d31 = weight of D (row1, offset +2)

    cmp         r7, #2                  @ wd == 2 -> loop_2
    beq         loop_2
    cmp         r7, #4                  @ wd == 4 -> loop_4
    beq         loop_4
                                        @ otherwise fall through to loop_8

loop_8:
    sub         r6, #1                  @ loop stores ht-1 rows, last row follows it
    vld1.8      {d0, d1, d2}, [r0], r2  @ row0: 24 bytes, r0 += src_strd
    vld1.8      {d5, d6, d7}, [r0], r2  @ row1: 24 bytes, r0 += src_strd
    vext.8      d3, d0, d1, #2          @ d3 = row0 bytes 2..9
    vext.8      d8, d5, d6, #2          @ d8 = row1 bytes 2..9

    vmull.u8    q5, d0, d28             @ q5 = weighted sum for output bytes 0..7
    vmlal.u8    q5, d5, d30
    vmlal.u8    q5, d3, d29
    vmlal.u8    q5, d8, d31
    vext.8      d9, d6, d7, #2          @ d9 = row1 bytes 10..17
    vext.8      d4, d1, d2, #2          @ d4 = row0 bytes 10..17

inner_loop_8:
    vmull.u8    q6, d6, d30             @ q6 = weighted sum for output bytes 8..15
    vmlal.u8    q6, d1, d28
    vmlal.u8    q6, d9, d31
    vmlal.u8    q6, d4, d29
    vmov        d0, d5                  @ current row1 becomes row0 of the next
    vmov        d3, d8                  @ output row (low half and its +2 shift)

    vqrshrun.s16 d14, q5, #6            @ (sum + 32) >> 6, saturated to u8
    vmov        d1, d6                  @ same row1 -> row0 move for the high half
    vmov        d4, d9

    vld1.8      {d5, d6, d7}, [r0], r2  @ load the next source row as new row1
    vqrshrun.s16 d15, q6, #6

    vext.8      d8, d5, d6, #2          @ new row1 bytes 2..9
    subs        r6, #1                  @ one row fewer; flags are used by bne below
    vext.8      d9, d6, d7, #2          @ new row1 bytes 10..17
    vst1.8      {q7}, [r1], r3          @ store 16 output bytes, r1 += dst_strd

    vmull.u8    q5, d0, d28             @ start the low half of the next output row
    vmlal.u8    q5, d5, d30
    vmlal.u8    q5, d3, d29
    vmlal.u8    q5, d8, d31
    bne         inner_loop_8

    vmull.u8    q6, d6, d30             @ finish the high half of the last row
    vmlal.u8    q6, d1, d28
    vmlal.u8    q6, d9, d31
    vmlal.u8    q6, d4, d29

    vqrshrun.s16 d14, q5, #6
    vqrshrun.s16 d15, q6, #6

    vst1.8      {q7}, [r1], r3          @ store the last output row

    b           end_func

loop_4:
    sub         r6, #1                  @ loop stores ht-1 rows, last row follows it
    vld1.8      {d0, d1}, [r0], r2      @ row0: 16 bytes, r0 += src_strd
    vld1.8      {d2, d3}, [r0], r2      @ row1: 16 bytes, r0 += src_strd
    vext.8      d1, d0, d1, #2          @ d1 = row0 bytes 2..9
    vext.8      d3, d2, d3, #2          @ d3 = row1 bytes 2..9

    vmull.u8    q2, d2, d30             @ q2 = weighted sum for output bytes 0..7
    vmlal.u8    q2, d0, d28
    vmlal.u8    q2, d3, d31
    vmlal.u8    q2, d1, d29

inner_loop_4:
    subs        r6, #1                  @ one row fewer; flags are used by bne below
    vmov        d0, d2                  @ current row1 becomes row0 of the next
    vmov        d1, d3                  @ output row

    vld1.8      {d2, d3}, [r0], r2      @ load the next source row as new row1
    vqrshrun.s16 d6, q2, #6             @ (sum + 32) >> 6, saturated to u8

    vext.8      d3, d2, d3, #2          @ new row1 bytes 2..9
    vst1.8      {d6}, [r1], r3          @ store 8 output bytes, r1 += dst_strd

    vmull.u8    q2, d0, d28             @ weighted sum for the next output row
    vmlal.u8    q2, d2, d30
    vmlal.u8    q2, d1, d29
    vmlal.u8    q2, d3, d31
    bne         inner_loop_4

    vqrshrun.s16 d6, q2, #6
    vst1.8      {d6}, [r1], r3          @ store the last output row

    b           end_func

loop_2:
    vld1.8      {d0}, [r0], r2          @ row0: 8 bytes, r0 += src_strd
    vext.8      d1, d0, d0, #2          @ d1 = row0 rotated by 2 bytes (low lanes = bytes 2..7)
    vld1.8      {d2}, [r0], r2          @ row1: 8 bytes, r0 += src_strd
    vext.8      d3, d2, d2, #2          @ d3 = row1 rotated by 2 bytes
    vmull.u8    q2, d0, d28             @ weighted sum for output row0
    vmlal.u8    q2, d1, d29
    vmlal.u8    q2, d2, d30
    vmlal.u8    q2, d3, d31
    vld1.8      {d6}, [r0]              @ row2; r0 is not advanced, so the next
                                        @ iteration reloads this row as its row0
    vqrshrun.s16 d4, q2, #6             @ (sum + 32) >> 6, saturated to u8
    vext.8      d7, d6, d6, #2          @ d7 = row2 rotated by 2 bytes
    vst1.32     d4[0], [r1], r3         @ store 4 bytes of output row0, r1 += dst_strd
    vmull.u8    q4, d2, d28             @ weighted sum for output row1
    vmlal.u8    q4, d3, d29
    vmlal.u8    q4, d6, d30
    vmlal.u8    q4, d7, d31
    subs        r6, #2                  @ two rows were produced this iteration
    vqrshrun.s16 d8, q4, #6
    vst1.32     d8[0], [r1], r3         @ store 4 bytes of output row1, r1 += dst_strd
    bne         loop_2                  @ repeat until the row count reaches 0
                                        @ (exits only if ht is a multiple of 2)

end_func:
    vldmia      sp!, {d8-d15}           @ restore the saved NEON registers
    ldmfd       sp!, {r4-r12, pc}       @ restore core registers; saved lr -> pc returns

252