@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_horz_qpel_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction horizontal quarter pel
@*  interpolation.
@*
@* @author
@*  Mohit
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_horz_qpel_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@*******************************************************************************
@*
@* @brief
@*     Quarter pel interprediction luma filter for horizontal input
@*
@* @par Description:
@*    Applies a 6 tap horizontal filter. The output is clipped to 8 bits and
@*    then averaged (with rounding) with the source sample at
@*    pu1_src + (x_offset >> 1) to form the quarter pel value.
@*    See sec 8.4.2.2.1 titled "Luma sample interpolation process".
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
@*
@* @param[in] dydx: x and y reference offset for qpel calculations.
@*
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_horz_qpel_a9q (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd,
@                            UWORD8* pu1_tmp,
@                            UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r5 =>  ht
@   r6 =>  wd
@   r7 =>  dydx (on entry); afterwards pu1_src + ((dydx & 3) >> 1)

.text
.p2align 2


    .global ih264_inter_pred_luma_horz_qpel_a9q

@ ------------------------------------------------------------------------------
@ ih264_inter_pred_luma_horz_qpel_a9q
@
@ Horizontal quarter-pel luma interpolation (ARMv7-A + NEON, AAPCS).
@
@ For every output pixel the 6-tap filter
@     (a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5
@ is evaluated on the source row starting at pu1_src - 2, saturated to 8 bits,
@ and then rounding-averaged with the source pixel at
@ pu1_src + ((dydx & 3) >> 1).
@
@ In:   r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@       [sp, #0] = ht, [sp, #4] = wd, [sp, #8] = pu1_tmp (unused),
@       [sp, #12] = dydx            (offsets relative to sp on entry)
@ Out:  none; the destination rows are written through r1.
@ Regs: r4-r12, lr and d8-d15 are saved and restored here; d0-d7 and d16-d31
@       are used as scratch.
@
@ Notes:
@  - wd == 8 and wd == 4 select dedicated loops; every other width is handled
@    by the 16-wide loop.
@  - Rows are processed in pairs. Nothing is done for ht <= 0; an odd ht is
@    rounded up to the next even row count, so callers are expected to pass an
@    even ht.
@  - Each source row is loaded as a whole vector: 24 bytes from pu1_src - 2 in
@    the 16-wide loop and 16 bytes in the 8- and 4-wide loops. The averaging
@    operand is loaded as 16 bytes (16-wide loop) or 8 bytes (8-/4-wide loops).
@ ------------------------------------------------------------------------------
ih264_inter_pred_luma_horz_qpel_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (10 words = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ save callee-saved NEON registers (64 bytes)
    ldr           r5, [sp, #104]        @ r5 = ht   (first stack argument, 104 = 40 + 64)
    ldr           r6, [sp, #108]        @ r6 = wd
    ldr           r7, [sp, #116]        @ r7 = dydx (pu1_tmp at #112 is not needed)
    cmp           r5, #0
    ble           end_func              @ no rows requested: just unwind the stack
    and           r7, r7, #3            @ x_offset = dydx & 3
    add           r7, r0, r7, lsr #1    @ r7 = pu1_src + (x_offset >> 1), averaging operand
    sub           r0, r0, #2            @ r0 = pu1_src - 2, first filter tap
    vmov.i8       d0, #5                @ filter coefficient 5  (taps a1, a4)
    vmov.i8       d1, #20               @ filter coefficient 20 (taps a2, a3)

    cmp           r6, #8
    beq           loop_8                @ wd == 8
    cmp           r6, #4
    beq           loop_4                @ wd == 4

loop_16:                                @ any other width: 16 pixels per row
    @ Two rows per iteration. "col1" is output pixels 0-7, "col2" is pixels 8-15.
    @ Accumulators: q4 = row0/col1, q5 = row0/col2, q7 = row1/col1, q8 = row1/col2.
    vld1.8        {d2, d3, d4}, [r0], r2    @ load row0 (24 bytes)
    vext.8        d31, d2, d3, #5           @ a5 (row0, col1)
    vld1.8        {d5, d6, d7}, [r0], r2    @ load row1 (24 bytes)
    vext.8        d30, d3, d4, #5           @ a5 (row0, col2)
    vaddl.u8      q4, d31, d2               @ a0 + a5 (row0, col1)
    vext.8        d28, d5, d6, #5           @ a5 (row1, col1)
    vaddl.u8      q5, d30, d3               @ a0 + a5 (row0, col2)
    vext.8        d27, d6, d7, #5           @ a5 (row1, col2)
    vaddl.u8      q7, d28, d5               @ a0 + a5 (row1, col1)
    vext.8        d31, d2, d3, #2           @ a2 (row0, col1)
    vaddl.u8      q8, d27, d6               @ a0 + a5 (row1, col2)
    vext.8        d30, d3, d4, #2           @ a2 (row0, col2)
    vmlal.u8      q4, d31, d1               @ += 20*a2 (row0, col1)
    vext.8        d28, d5, d6, #2           @ a2 (row1, col1)
    vmlal.u8      q5, d30, d1               @ += 20*a2 (row0, col2)
    vext.8        d27, d6, d7, #2           @ a2 (row1, col2)
    vmlal.u8      q7, d28, d1               @ += 20*a2 (row1, col1)
    vext.8        d31, d2, d3, #3           @ a3 (row0, col1)
    vmlal.u8      q8, d27, d1               @ += 20*a2 (row1, col2)
    vext.8        d30, d3, d4, #3           @ a3 (row0, col2)
    vmlal.u8      q4, d31, d1               @ += 20*a3 (row0, col1)
    vext.8        d28, d5, d6, #3           @ a3 (row1, col1)
    vmlal.u8      q5, d30, d1               @ += 20*a3 (row0, col2)
    vext.8        d27, d6, d7, #3           @ a3 (row1, col2)
    vmlal.u8      q7, d28, d1               @ += 20*a3 (row1, col1)
    vext.8        d31, d2, d3, #1           @ a1 (row0, col1)
    vmlal.u8      q8, d27, d1               @ += 20*a3 (row1, col2)
    vext.8        d30, d3, d4, #1           @ a1 (row0, col2)
    vmlsl.u8      q4, d31, d0               @ -= 5*a1 (row0, col1)
    vext.8        d28, d5, d6, #1           @ a1 (row1, col1)
    vmlsl.u8      q5, d30, d0               @ -= 5*a1 (row0, col2)
    vext.8        d27, d6, d7, #1           @ a1 (row1, col2)
    vmlsl.u8      q7, d28, d0               @ -= 5*a1 (row1, col1)
    vext.8        d31, d2, d3, #4           @ a4 (row0, col1)
    vmlsl.u8      q8, d27, d0               @ -= 5*a1 (row1, col2)
    vext.8        d30, d3, d4, #4           @ a4 (row0, col2)
    vmlsl.u8      q4, d31, d0               @ -= 5*a4 (row0, col1)
    vext.8        d28, d5, d6, #4           @ a4 (row1, col1)
    vmlsl.u8      q5, d30, d0               @ -= 5*a4 (row0, col2)
    vext.8        d27, d6, d7, #4           @ a4 (row1, col2)
    vmlsl.u8      q7, d28, d0               @ -= 5*a4 (row1, col1)
    vmlsl.u8      q8, d27, d0               @ -= 5*a4 (row1, col2)
    vld1.32       {d12, d13}, [r7], r2      @ averaging operand for row0
    vqrshrun.s16  d20, q4, #5               @ (sum + 16) >> 5, saturate to u8 (row0, col1)
    vqrshrun.s16  d21, q5, #5               @ (sum + 16) >> 5, saturate to u8 (row0, col2)
    vrhadd.u8     q10, q6, q10              @ qpel row0 = (operand + half-pel + 1) >> 1
    vqrshrun.s16  d18, q7, #5               @ (sum + 16) >> 5, saturate to u8 (row1, col1)
    vst1.8        {d20, d21}, [r1], r3      @ store dest row0
    vqrshrun.s16  d19, q8, #5               @ (sum + 16) >> 5, saturate to u8 (row1, col2)
    vld1.32       {d12, d13}, [r7], r2      @ averaging operand for row1
    vrhadd.u8     q9, q6, q9                @ qpel row1 = (operand + half-pel + 1) >> 1
    vst1.8        {d18, d19}, [r1], r3      @ store dest row1
    subs          r5, r5, #2                @ two rows done
    bgt           loop_16                   @ signed test: also stops on odd heights
    b             end_func

loop_8:                                 @ wd == 8
    @ Two rows per iteration: q7 accumulates row0, q4 accumulates row1.
    vld1.8        {d5, d6}, [r0], r2        @ load row0 (16 bytes)
    vext.8        d28, d5, d6, #5           @ a5 (row0)
    vld1.8        {d2, d3}, [r0], r2        @ load row1 (16 bytes)
    vext.8        d25, d5, d6, #2           @ a2 (row0)
    vext.8        d31, d2, d3, #5           @ a5 (row1)
    vext.8        d24, d5, d6, #3           @ a3 (row0)
    vext.8        d23, d5, d6, #1           @ a1 (row0)
    vext.8        d22, d5, d6, #4           @ a4 (row0)
    vaddl.u8      q7, d28, d5               @ a0 + a5 (row0)
    vext.8        d29, d2, d3, #3           @ a3 (row1)
    vmlal.u8      q7, d25, d1               @ += 20*a2 (row0)
    vmlal.u8      q7, d24, d1               @ += 20*a3 (row0)
    vmlsl.u8      q7, d23, d0               @ -= 5*a1 (row0)
    vmlsl.u8      q7, d22, d0               @ -= 5*a4 (row0)
    vext.8        d30, d2, d3, #2           @ a2 (row1)
    vaddl.u8      q4, d31, d2               @ a0 + a5 (row1)
    vext.8        d27, d2, d3, #1           @ a1 (row1)
    vext.8        d26, d2, d3, #4           @ a4 (row1)
    vmlal.u8      q4, d29, d1               @ += 20*a3 (row1)
    vmlal.u8      q4, d30, d1               @ += 20*a2 (row1)
    vmlsl.u8      q4, d27, d0               @ -= 5*a1 (row1)
    vmlsl.u8      q4, d26, d0               @ -= 5*a4 (row1)
    vqrshrun.s16  d18, q7, #5               @ (sum + 16) >> 5, saturate to u8 (row0)
    vld1.32       {d12}, [r7], r2           @ averaging operand for row0
    vld1.32       {d13}, [r7], r2           @ averaging operand for row1
    vqrshrun.s16  d19, q4, #5               @ (sum + 16) >> 5, saturate to u8 (row1)
    vrhadd.u8     q9, q6, q9                @ qpel = (operand + half-pel + 1) >> 1, both rows
    vst1.8        {d18}, [r1], r3           @ store dest row0
    vst1.8        {d19}, [r1], r3           @ store dest row1
    subs          r5, r5, #2                @ two rows done
    bgt           loop_8                    @ signed test: also stops on odd heights
    b             end_func

loop_4:                                 @ wd == 4
    @ Same arithmetic as the 8-wide loop; only the low 4 bytes of each result
    @ row are written to the destination.
    vld1.8        {d5, d6}, [r0], r2        @ load row0 (16 bytes)
    vext.8        d28, d5, d6, #5           @ a5 (row0)
    vld1.8        {d2, d3}, [r0], r2        @ load row1 (16 bytes)
    vext.8        d25, d5, d6, #2           @ a2 (row0)
    vext.8        d31, d2, d3, #5           @ a5 (row1)
    vaddl.u8      q7, d28, d5               @ a0 + a5 (row0)
    vext.8        d24, d5, d6, #3           @ a3 (row0)
    vext.8        d23, d5, d6, #1           @ a1 (row0)
    vext.8        d22, d5, d6, #4           @ a4 (row0)
    vext.8        d29, d2, d3, #3           @ a3 (row1)
    vmlal.u8      q7, d25, d1               @ += 20*a2 (row0)
    vmlal.u8      q7, d24, d1               @ += 20*a3 (row0)
    vmlsl.u8      q7, d23, d0               @ -= 5*a1 (row0)
    vmlsl.u8      q7, d22, d0               @ -= 5*a4 (row0)
    vaddl.u8      q4, d31, d2               @ a0 + a5 (row1)
    vext.8        d30, d2, d3, #2           @ a2 (row1)
    vld1.32       {d12}, [r7], r2           @ averaging operand for row0
    vld1.32       {d13}, [r7], r2           @ averaging operand for row1
    vext.8        d27, d2, d3, #1           @ a1 (row1)
    vext.8        d26, d2, d3, #4           @ a4 (row1)
    vmlal.u8      q4, d29, d1               @ += 20*a3 (row1)
    vmlal.u8      q4, d30, d1               @ += 20*a2 (row1)
    vmlsl.u8      q4, d27, d0               @ -= 5*a1 (row1)
    vmlsl.u8      q4, d26, d0               @ -= 5*a4 (row1)
    vqrshrun.s16  d18, q7, #5               @ (sum + 16) >> 5, saturate to u8 (row0)
    vqrshrun.s16  d19, q4, #5               @ (sum + 16) >> 5, saturate to u8 (row1)
    vrhadd.u8     q9, q6, q9                @ qpel = (operand + half-pel + 1) >> 1, both rows
    vst1.32       {d18[0]}, [r1], r3        @ store 4 bytes of dest row0
    vst1.32       {d19[0]}, [r1], r3        @ store 4 bytes of dest row1

    subs          r5, r5, #2                @ two rows done
    bgt           loop_4                    @ signed test: also stops on odd heights

end_func:
    vldmia        sp!, {d8-d15}         @ restore callee-saved NEON registers
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return
