@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@* ihevc_inter_pred_filters_luma_vert.s
@*
@* @brief
@* contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@* parthiban v
@*
@* @par list of functions:
@*
@* - ihevc_inter_pred_luma_vert()
@* - ihevc_inter_pred_luma_vert_w16out()
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */



@/**
@*******************************************************************************
@*
@* @brief
@* inter prediction luma filter for vertical input
@*
@* @par description:
@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@* the elements pointed to by 'pu1_src' and writes to the location pointed
@* to by 'pu1_dst'. the output is downshifted by 6 and clipped to 8 bits.
@* assumptions : the function is optimized assuming width is a multiple of
@* 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert (
@                                 uword8 *pu1_src,
@                                 uword8 *pu1_dst,
@                                 word32 src_strd,
@                                 word32 dst_strd,
@                                 word8 *pi1_coeff,
@                                 word32 ht,
@                                 word32 wd )

@**************variables vs registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r6 => dst_strd
@ r12 => *pi1_coeff
@ r5 => wd
@ r7 => ht
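
@ The NEON code below is easier to follow against a plain C model of what it
@ computes. The sketch below is only an illustration (it loosely mirrors the
@ C routine that the note above says this file replicates, and is not copied
@ from ihevc_inter_pred_filters.c); the typedefs and the helper name
@ ref_luma_vert are assumptions made for this sketch.
@
@   typedef unsigned char UWORD8;
@   typedef signed char   WORD8;
@   typedef int           WORD32;
@
@   /* hypothetical reference, not the library's exact code */
@   static void ref_luma_vert(UWORD8 *pu1_src, WORD32 src_strd,
@                             UWORD8 *pu1_dst, WORD32 dst_strd,
@                             WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
@   {
@       for(WORD32 row = 0; row < ht; row++)
@       {
@           for(WORD32 col = 0; col < wd; col++)
@           {
@               WORD32 sum = 0;
@               /* 8-tap filter over rows -3 .. +4 around the current row */
@               for(WORD32 i = 0; i < 8; i++)
@                   sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@               sum = (sum + 32) >> 6;        /* round and downshift by 6 */
@               if(sum < 0)   sum = 0;        /* clip to 8 bits */
@               if(sum > 255) sum = 255;
@               pu1_dst[col] = (UWORD8)sum;
@           }
@           pu1_src += src_strd;
@           pu1_dst += dst_strd;
@       }
@   }
@
@ The assembly below performs the same arithmetic: vabs.s8 strips the sign of
@ the taps, the taps that are negative in the HEVC luma filters are applied
@ with vmlsl and the positive ones with vmlal, and the rounding, shift and
@ clip are folded into the single vqrshrun.s16 #6 instruction.
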
.text
.align 4




.globl ihevc_inter_pred_luma_vert_a9q

.type ihevc_inter_pred_luma_vert_a9q, %function

ihevc_inter_pred_luma_vert_a9q:

    stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments

    ldr r12,[sp,#40] @load pi1_coeff
    mov r6,r3 @r6 = dst_strd
    ldr r5,[sp,#48] @load wd
    vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
    sub r12,r2,r2,lsl #2 @r12 = -3 * src_strd
    vabs.s8 d0,d0 @vabs_s8(coeff)
    add r0,r0,r12 @pu1_src -= 3 * src_strd
    ldr r3,[sp,#44] @load ht
    subs r7,r3,#0 @r7 = ht
    @ble end_loops @end loop jump
    vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp r5,#8
    vdup.u8 d23,d0[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8 d24,d0[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8 d25,d0[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8 d26,d0[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8 d27,d0[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8 d28,d0[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8 d29,d0[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt core_loop_wd_4 @core loop wd 4 jump
    str r0, [sp, #-4]!
    str r1, [sp, #-4]!

    bic r4,r5,#7 @r5 ->wd
    rsb r9,r4,r6,lsl #2 @r6->dst_strd r5 ->wd
    rsb r8,r4,r2,lsl #2 @r2->src_strd
    mov r3, r5, lsr #3 @divide by 8
    mul r7, r3 @r7 = ht * (wd / 8)
    sub r7, #4 @subtract one iteration (4 rows) for epilog

prolog:

    and r10, r0, #31
    add r3,r0,r2 @pu1_src_tmp += src_strd@
    vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs r4,r4,#8
    vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    addle r0,r0,r8
    vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    bicle r4,r5,#7 @r5 ->wd
    vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld [r3]
    vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld [r3, r2]
    vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld [r3, r2, lsl #1]
    vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add r3, r3, r2
    vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    pld [r3, r2, lsl #1]
    vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add r3,r0,r2 @pu1_src_tmp += src_strd@
    vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d1},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q6,d3,d23
    vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q6,d2,d22
    vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q6,d4,d24
    vmlal.u8 q6,d5,d25
    vmlal.u8 q6,d6,d26
    vmlsl.u8 q6,d7,d27
    vmlal.u8 q6,d16,d28
    vmlsl.u8 q6,d17,d29
    add r14,r1,r6
    vst1.8 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
    addle r1,r1,r9

    vmull.u8 q7,d4,d23
    subs r7,r7,#4
    vmlsl.u8 q7,d3,d22
    vmlsl.u8 q7,d5,d24
    vmlal.u8 q7,d6,d25
    vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d7,d26
    vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d16,d27
    vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d17,d28
    vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d18,d29
    vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6


    blt epilog_end @jumps to epilog_end
    beq epilog @jumps to epilog

kernel_8:

    subs r4,r4,#8
    vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle r0,r0,r8
    vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    bicle r4,r5,#7 @r5 ->wd
    vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8 {d12},[r14],r6

@ and r11, r0, #31
    vqrshrun.s16 d14,q7,#6

    add r3,r0,r2 @pu1_src_tmp += src_strd@
    vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vst1.8 {d14},[r14],r6
    vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    add r14,r1,#0
    vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    add r1, r1, #8
    vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

    vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    addle r1,r1,r9
    vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

@ cmp r11, r10
    vmull.u8 q6,d3,d23

    add r10, r3, r2, lsl #3 @ 10*strd - 8+2
    vmlsl.u8 q6,d2,d22

    add r10, r10, r2 @ 11*strd
    vmlsl.u8 q6,d4,d24

    vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q6,d5,d25

    vmlal.u8 q6,d6,d26
    vst1.8 {d8},[r14],r6 @vst1_u8(pu1_dst,sto_res)@

    pld [r10] @11+ 0
    vmlsl.u8 q6,d7,d27

    pld [r10, r2] @11+ 1*strd
    vmlal.u8 q6,d16,d28

    pld [r10, r2, lsl #1] @11+ 2*strd
    vmlsl.u8 q6,d17,d29

    add r10, r10, r2 @12*strd
    vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    pld [r10, r2, lsl #1] @11+ 3*strd
    vmull.u8 q7,d4,d23

@ mov r10, r11
    vmlsl.u8 q7,d3,d22

    subs r7,r7,#4
    vmlsl.u8 q7,d5,d24

    vmlal.u8 q7,d6,d25
    vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d7,d26
    vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d16,d27
    vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d17,d28
    vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d18,d29
    vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vqrshrun.s16 d12,q6,#6
    vst1.8 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@



    bgt kernel_8 @jumps to kernel_8

epilog:

    vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8 {d12},[r14],r6

    vqrshrun.s16 d14,q7,#6

    vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8 {d14},[r14],r6

    vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q6,d3,d23
    vmlsl.u8 q6,d2,d22
    vmlsl.u8 q6,d4,d24
    vmlal.u8 q6,d5,d25
    vmlal.u8 q6,d6,d26
    vmlsl.u8 q6,d7,d27
    vmlal.u8 q6,d16,d28
    vmlsl.u8 q6,d17,d29
    add r14,r1,r6
    vst1.8 {d8},[r1]! @vst1_u8(pu1_dst,sto_res)@
    vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q7,d4,d23
    vmlsl.u8 q7,d3,d22
    vmlsl.u8 q7,d5,d24
    vmlal.u8 q7,d6,d25
    vmlal.u8 q7,d7,d26
    vmlsl.u8 q7,d16,d27
    vmlal.u8 q7,d17,d28
    vmlsl.u8 q7,d18,d29

    vst1.8 {d10},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
    vqrshrun.s16 d12,q6,#6

epilog_end:
    vst1.8 {d12},[r14],r6
    vqrshrun.s16 d14,q7,#6

    vst1.8 {d14},[r14],r6


end_loops:
    tst r5,#7
    ldr r1, [sp], #4
    ldr r0, [sp], #4

    ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
    mov r5, #4
    add r0, r0, #8
    add r1, r1, #8
    mov r7, #16
    @

core_loop_wd_4:
    rsb r9,r5,r6,lsl #2 @r6->dst_strd r5 ->wd
    rsb r8,r5,r2,lsl #2 @r2->src_strd
    vmov.i8 d4,#0

outer_loop_wd_4:
    subs r12,r5,#0
    ble end_inner_loop_wd_4 @outer loop jump

inner_loop_wd_4:
    add r3,r0,r2
    vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs r12,r12,#4
    vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32 {d4[0]},[r0] @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8 q0,d5,d23 @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add r0,r0,#4
    vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8 q0,d4,d22 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q0,d6,d24 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8 q4,d7,d23
    vdup.u32 d4,d7[1] @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8 q1,d7,d25 @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8 q4,d6,d22
    vmlal.u8 q0,d4,d26 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8 q4,d4,d24
    vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8 q1,d5,d27 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8 q4,d5,d25
    vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8 q0,d6,d28 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8 q4,d6,d26
    vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q1,d7,d29 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32 d4,d7[1]
    vadd.i16 q0,q0,q1 @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8 q4,d7,d27
    vld1.u32 {d4[1]},[r3],r2
    vmlal.u8 q4,d4,d28
    vdup.u32 d5,d4[1]
    vqrshrun.s16 d0,q0,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32 {d5[1]},[r3]
    add r3,r1,r6
    vst1.32 {d0[0]},[r1] @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8 q4,d5,d29
    vst1.32 {d0[1]},[r3],r6 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    vqrshrun.s16 d8,q4,#6

    vst1.32 {d8[0]},[r3],r6
    add r1,r1,#4
    vst1.32 {d8[1]},[r3]
    bgt inner_loop_wd_4

end_inner_loop_wd_4:
    subs r7,r7,#4
    add r1,r1,r9
    add r0,r0,r8
    bgt outer_loop_wd_4

    ldmfd sp!, {r4-r12, r15} @reload the registers from sp



@/**
@*******************************************************************************
@*
@* @brief
@* inter prediction luma filter for vertical 16-bit output
@*
@* @par description:
@* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@* the elements pointed to by 'pu1_src' and writes to the location pointed
@* to by 'pi2_dst'. no downshifting or clipping is done and the output is
@* used as an input for weighted prediction.
@* assumptions : the function is optimized assuming width is a multiple of
@* 4 or 8 and height is a multiple of 2.
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pi2_dst
@* word16 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16out(uword8 *pu1_src,
@                                       word16 *pi2_dst,
@                                       word32 src_strd,
@                                       word32 dst_strd,
@                                       word8 *pi1_coeff,
@                                       word32 ht,
@                                       word32 wd )

@**************variables vs registers*****************************************
@ r0 => *pu1_src
@ r1 => *pi2_dst
@ r2 => src_strd
@ r6 => dst_strd
@ r12 => *pi1_coeff
@ r5 => wd
@ r7 => ht
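
@ As with the 8-bit routine above, a plain C model of the w16out variant may
@ help when reading the NEON code. This is only an illustrative sketch (it
@ loosely mirrors the w16out C routine referenced at the top of this file and
@ is not copied from it); the typedefs and the helper name
@ ref_luma_vert_w16out are assumptions. dst_strd is taken in 16-bit units,
@ which is why the code below doubles r6 before using it as a byte stride.
@
@   typedef unsigned char UWORD8;
@   typedef signed char   WORD8;
@   typedef short         WORD16;
@   typedef int           WORD32;
@
@   /* hypothetical reference, not the library's exact code */
@   static void ref_luma_vert_w16out(UWORD8 *pu1_src, WORD32 src_strd,
@                                    WORD16 *pi2_dst, WORD32 dst_strd,
@                                    WORD8 *pi1_coeff, WORD32 ht, WORD32 wd)
@   {
@       for(WORD32 row = 0; row < ht; row++)
@       {
@           for(WORD32 col = 0; col < wd; col++)
@           {
@               WORD32 sum = 0;
@               for(WORD32 i = 0; i < 8; i++)
@                   sum += pi1_coeff[i] * pu1_src[col + (i - 3) * src_strd];
@               pi2_dst[col] = (WORD16)sum;   /* no shift, no clip: 16-bit out */
@           }
@           pu1_src += src_strd;
@           pi2_dst += dst_strd;              /* dst_strd in 16-bit samples */
@       }
@   }
@
@ The only difference from the 8-bit path is that the 16-bit accumulators are
@ stored directly (the vqrshrun.s16 lines are commented out below), so no
@ rounding, downshift or clipping is applied.
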


.globl ihevc_inter_pred_luma_vert_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16out_a9q:

    stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments

    ldr r12,[sp,#40] @load pi1_coeff
    mov r6,r3 @r6 = dst_strd
    ldr r5,[sp,#48] @load wd
    vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff)
    sub r12,r2,r2,lsl #2 @r12 = -3 * src_strd
    vabs.s8 d0,d0 @vabs_s8(coeff)
    add r0,r0,r12 @pu1_src -= 3 * src_strd
    ldr r3,[sp,#44] @load ht
    subs r7,r3,#0 @r7 = ht
    @ble end_loops_16out @end loop jump
    vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    cmp r5,#8
    vdup.u8 d23,d0[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.u8 d24,d0[2] @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.u8 d25,d0[3] @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.u8 d26,d0[4] @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.u8 d27,d0[5] @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.u8 d28,d0[6] @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.u8 d29,d0[7] @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    blt core_loop_wd_4_16out @core loop wd 4 jump
    str r0, [sp, #-4]!
    str r1, [sp, #-4]!

    bic r4,r5,#7 @r5 ->wd
    rsb r9,r4,r6,lsl #2 @r6->dst_strd r5 ->wd
    rsb r8,r4,r2,lsl #2 @r2->src_strd
    mov r6, r6, lsl #1 @dst_strd in bytes (16-bit output)
    mov r3, r5, lsr #3 @divide by 8
    mul r7, r3 @r7 = ht * (wd / 8)
    sub r7, #4 @subtract one iteration (4 rows) for epilog

prolog_16out:

    and r10, r0, #31
    add r3,r0,r2 @pu1_src_tmp += src_strd@

    vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs r4,r4,#8
    vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


    addle r0,r0,r8
    vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    bicle r4,r5,#7 @r5 ->wd
    vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    pld [r3]
    vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    pld [r3, r2]
    vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    pld [r3, r2, lsl #1]
    vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    add r3, r3, r2
    vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    pld [r3, r2, lsl #1]
    vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add r3,r0,r2 @pu1_src_tmp += src_strd@
    vmull.u8 q6,d3,d23
    vld1.u8 {d1},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q6,d2,d22
    vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q6,d4,d24
    vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q6,d5,d25
    vmlal.u8 q6,d6,d26
    vmlsl.u8 q6,d7,d27
    vmlal.u8 q6,d16,d28
    vmlsl.u8 q6,d17,d29
    add r14,r1,r6
    vst1.8 {d8, d9},[r1]! @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@
    addle r1,r1,r9,lsl #1

    vmull.u8 q7,d4,d23
    subs r7,r7,#4
    vmlsl.u8 q7,d3,d22
    vmlsl.u8 q7,d5,d24
    vmlal.u8 q7,d6,d25
    vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d7,d26
    vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d16,d27
    vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d17,d28
    vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d18,d29
    vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.8 {d10, d11},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6


    blt epilog_end_16out
    beq epilog_16out @jumps to epilog

kernel_8_16out:

    subs r4,r4,#8
    vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@

    addle r0,r0,r8
    vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@

    vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@

    vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@

    bicle r4,r5,#7 @r5 ->wd
    vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@

    vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@

    vst1.8 {d12,d13},[r14],r6
    vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@

    add r3,r0,r2 @pu1_src_tmp += src_strd@
    vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@


@ and r11, r0, #31
    vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@

    vst1.8 {d14,d15},[r14],r6
    vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@

    add r14,r1,r6
    vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@

    vld1.u8 {d0},[r0]! @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@

    vld1.u8 {d1},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@

    vst1.8 {d8,d9},[r1]! @vst1_u8(pu1_dst,sto_res)@
    vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@

    addle r1,r1,r9,lsl #1
    vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@

@ cmp r11, r10
    vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@

    add r10, r3, r2, lsl #3 @ 10*strd - 8+2
    vmull.u8 q6,d3,d23

    add r10, r10, r2 @ 11*strd
    vmlsl.u8 q6,d2,d22

    pld [r10] @11+ 0
    vmlsl.u8 q6,d4,d24

    pld [r10, r2] @11+ 1*strd
    vmlal.u8 q6,d5,d25

    pld [r10, r2, lsl #1] @11+ 2*strd
    vmlal.u8 q6,d6,d26

    add r10, r10, r2 @12*strd
    vmlsl.u8 q6,d7,d27

    pld [r10, r2, lsl #1] @11+ 3*strd
    vmlal.u8 q6,d16,d28

@ mov r10, r11
    vmlsl.u8 q6,d17,d29

    vld1.u8 {d2},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q7,d4,d23

    subs r7,r7,#4
    vmlsl.u8 q7,d3,d22

    vst1.8 {d10, d11},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
    vmlsl.u8 q7,d5,d24

    vld1.u8 {d3},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d6,d25

    vld1.u8 {d4},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d7,d26

    vld1.u8 {d5},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d16,d27

    vld1.u8 {d6},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.u8 q7,d17,d28

    vld1.u8 {d7},[r3],r2 @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlsl.u8 q7,d18,d29


    bgt kernel_8_16out @jumps to kernel_8

epilog_16out:

    vmull.u8 q4,d1,d23 @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlsl.u8 q4,d0,d22 @mul_res1 = vmlsl_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlsl.u8 q4,d2,d24 @mul_res1 = vmlsl_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.u8 q4,d3,d25 @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.u8 q4,d4,d26 @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlsl.u8 q4,d5,d27 @mul_res1 = vmlsl_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.u8 q4,d6,d28 @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlsl.u8 q4,d7,d29 @mul_res1 = vmlsl_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.8 {d12,d13},[r14],r6

    @vqrshrun.s16 d14,q7,#6

    vld1.u8 {d16},[r3],r2 @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q5,d2,d23 @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlsl.u8 q5,d1,d22 @mul_res2 = vmlsl_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlsl.u8 q5,d3,d24 @mul_res2 = vmlsl_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.u8 q5,d4,d25 @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.u8 q5,d5,d26 @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlsl.u8 q5,d6,d27 @mul_res2 = vmlsl_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.u8 q5,d7,d28 @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlsl.u8 q5,d16,d29 @mul_res2 = vmlsl_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.8 {d14,d15},[r14],r6

    @vqrshrun.s16 d8,q4,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d17},[r3],r2 @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q6,d3,d23
    vmlsl.u8 q6,d2,d22
    vmlsl.u8 q6,d4,d24
    vmlal.u8 q6,d5,d25
    vmlal.u8 q6,d6,d26
    vmlsl.u8 q6,d7,d27
    vmlal.u8 q6,d16,d28
    vmlsl.u8 q6,d17,d29
    add r14,r1,r6
    vst1.8 {d8,d9},[r1]! @vst1_u8(pu1_dst,sto_res)@
    @vqrshrun.s16 d10,q5,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u8 {d18},[r3],r2 @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.u8 q7,d4,d23
    vmlsl.u8 q7,d3,d22
    vmlsl.u8 q7,d5,d24
    vmlal.u8 q7,d6,d25
    vmlal.u8 q7,d7,d26
    vmlsl.u8 q7,d16,d27
    vmlal.u8 q7,d17,d28
    vmlsl.u8 q7,d18,d29

    vst1.8 {d10,d11},[r14],r6 @vst1_u8(pu1_dst_tmp,sto_res)@
    @vqrshrun.s16 d12,q6,#6

epilog_end_16out:
    vst1.8 {d12,d13},[r14],r6
    @vqrshrun.s16 d14,q7,#6

    vst1.8 {d14,d15},[r14],r6


end_loops_16out:
    tst r5,#7
    ldr r1, [sp], #4
    ldr r0, [sp], #4

    ldmeqfd sp!,{r4-r12,r15} @reload the registers from sp
    mov r5, #4
    add r0, r0, #8
    add r1, r1, #16
    mov r7, #16
    mov r6, r6, lsr #1

    @

core_loop_wd_4_16out:
    rsb r9,r5,r6,lsl #2 @r6->dst_strd r5 ->wd
    rsb r8,r5,r2,lsl #2 @r2->src_strd
    vmov.i8 d4,#0
    mov r6, r6, lsl #1

outer_loop_wd_4_16out:
    subs r12,r5,#0
    ble end_inner_loop_wd_4_16out @outer loop jump

inner_loop_wd_4_16out:
    add r3,r0,r2
    vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    subs r12,r12,#4
    vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vld1.u32 {d4[0]},[r0] @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 0)@
    vmull.u8 q0,d5,d23 @mul_res1 = vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1)@

    vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    add r0,r0,#4
    vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlsl.u8 q0,d4,d22 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_0)@

    vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q0,d6,d24 @mul_res1 = vmlsl_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_2)@

    vmull.u8 q4,d7,d23
    vdup.u32 d4,d7[1] @src_tmp1 = vdup_lane_u32(src_tmp4, 1)@
    vmull.u8 q1,d7,d25 @mul_res2 = vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3)@
    vld1.u32 {d4[1]},[r3],r2 @src_tmp1 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp1, 1)@
    vmlsl.u8 q4,d6,d22
    vmlal.u8 q0,d4,d26 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp1), coeffabs_4)@

    vdup.u32 d5,d4[1] @src_tmp2 = vdup_lane_u32(src_tmp1, 1)@
    vmlsl.u8 q4,d4,d24
    vld1.u32 {d5[1]},[r3],r2 @src_tmp2 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp2, 1)@
    vmlsl.u8 q1,d5,d27 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp2), coeffabs_5)@

    vdup.u32 d6,d5[1] @src_tmp3 = vdup_lane_u32(src_tmp2, 1)@
    vmlal.u8 q4,d5,d25
    vld1.u32 {d6[1]},[r3],r2 @src_tmp3 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp3, 1)@
    vmlal.u8 q0,d6,d28 @mul_res1 = vmlal_u8(mul_res1, vreinterpret_u8_u32(src_tmp3), coeffabs_6)@

    vdup.u32 d7,d6[1] @src_tmp4 = vdup_lane_u32(src_tmp3, 1)@
    vmlal.u8 q4,d6,d26
    vld1.u32 {d7[1]},[r3],r2 @src_tmp4 = vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp4, 1)@
    vmlsl.u8 q1,d7,d29 @mul_res2 = vmlsl_u8(mul_res2, vreinterpret_u8_u32(src_tmp4), coeffabs_7)@

    vdup.u32 d4,d7[1]
    vadd.i16 q0,q0,q1 @mul_res1 = vaddq_u16(mul_res1, mul_res2)@

    vmlsl.u8 q4,d7,d27
    vld1.u32 {d4[1]},[r3],r2
    vmlal.u8 q4,d4,d28
    vdup.u32 d5,d4[1]
    @vqrshrun.s16 d0,q0,#6 @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.u32 {d5[1]},[r3]
    add r3,r1,r6
    vst1.32 {d0},[r1]! @vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0)@

    vmlsl.u8 q4,d5,d29
    vst1.32 {d1},[r3],r6 @vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1)@
    @vqrshrun.s16 d8,q4,#6

    vst1.32 {d8},[r3],r6
    @add r1,r1,#4
    vst1.32 {d9},[r3]
    bgt inner_loop_wd_4_16out

end_inner_loop_wd_4_16out:
    subs r7,r7,#4
    add r1,r1,r9,lsl #1
    add r0,r0,r8
    bgt outer_loop_wd_4_16out

    ldmfd sp!, {r4-r12, r15} @reload the registers from sp
