blob: 546c807e5d0c5b3fbceec8eda44907be08877cd9 [file] [log] [blame]
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//* ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
24//*
25//* @brief
26//* Contains function definitions for inter prediction interpolation.
27//*
28//* @author
29//* Mohit
30//*
31//* @par List of Functions:
32//*
33//* - ih264_inter_pred_luma_horz_hpel_vert_qpel_av8()
34//*
35//* @remarks
36//* None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46///**
47//*******************************************************************************
48//*
49//* @brief
50//* This function implements a two stage cascaded six tap filter. It
51//* applies the six tap filter in the horizontal direction on the
52//* predictor values, followed by applying the same filter in the
53//* vertical direction on the output of the first stage. It then averages
54//* the output of the 1st stage and the output of the 2nd stage to obtain
55//* the quarter pel values. The six tap filtering operation is described
56//* in sec 8.4.2.2.1 titled "Luma sample interpolation process".
57//*
58//* @par Description:
59//* This function is called to obtain pixels lying at the following
60//* location (1/2,1/4) or (1/2,3/4). The function interpolates
61//* the predictors first in the horizontal direction and then in the
62//* vertical direction to output the (1/2,1/2). It then averages
63//* the output of the 1st stage and the (1/2,1/2) value to obtain (1/2,1/4)
64//* or (1/2,3/4) depending on the offset.
65//*
66//* @param[in] pu1_src
67//* UWORD8 pointer to the source
68//*
69//* @param[out] pu1_dst
70//* UWORD8 pointer to the destination
71//*
72//* @param[in] src_strd
73//* integer source stride
74//*
75//* @param[in] dst_strd
76//* integer destination stride
77//*
78//* @param[in] ht
79//* integer height of the array
80//*
81//* @param[in] wd
82//* integer width of the array
83//*
84//* @param[in] pu1_tmp: temporary buffer
85//*
86//* @param[in] dydx: x and y reference offset for qpel calculations
87//*
88//* @returns
89//*
90//* @remarks
91//* None
92//*
93//*******************************************************************************
94//*/;
95
96//void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
97// UWORD8 *pu1_dst,
98// WORD32 src_strd,
99// WORD32 dst_strd,
100// WORD32 ht,
101// WORD32 wd,
102// UWORD8* pu1_tmp,
103// UWORD32 dydx)
104
105//**************Variables Vs Registers*****************************************
106// x0 => *pu1_src
107// x1 => *pu1_dst
108// x2 => src_strd
109// x3 => dst_strd
110// x4 => ht
111// x5 => wd
112// x7 => dydx
113// x9 => *pu1_tmp (passed in x6, copied to x9 on entry)
114
115.text
116.p2align 2
117.include "ih264_neon_macros.s"
118
119
120
121 .global ih264_inter_pred_luma_horz_hpel_vert_qpel_av8
122
// Function entry: saves registers, rewinds the source pointer to the top-left
// tap of the six-tap window, derives the temp-buffer pointers and dispatches
// on wd (4, 8, otherwise the 16-wide path that follows).
123ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
124
125
126 // store register values to stack
127 push_v_regs // macro from ih264_neon_macros.s (body not visible in this file)
128 stp x19, x20, [sp, #-16]! // restored at end_func; x19/x20 are not otherwise referenced in this file
129
130
131
132 sub x0, x0, x2, lsl #1 // pu1_src-2*src_strd
133 sub x0, x0, #2 // pu1_src-2
134
135 mov x9, x6 // x9 = pu1_tmp; x6 is reused below as the temp-row pitch
136
// x7 = dydx >> 3. A single shift is applied and no mask, so this equals
// ((dydx >> 2) & 0x3) >> 1 only when dydx < 16.
// NOTE(review): confirm callers never pass dydx >= 16.
137 lsr x7, x7, #3 // dydx >> 3: the bit that selects which temp row is averaged in
138
139 add x7, x7, #2
140 mov x6, #48 // x6 = 48, post-increment used for every temp-buffer store/load
141 madd x7, x7, x6, x9 // x7 = pu1_tmp + ((dydx >> 3) + 2) * 48
142
143 subs x12, x5, #4 //if wd=4 branch to loop_4
144 beq loop_4_start
145
146 subs x12, x5, #8 //if wd=8 branch to loop_8
147 beq loop_8_start
148
149 //when wd=16
150 movi v22.8h, #20 // Filter coeff 20 (0x14) in every halfword of v22
151 movi v24.8h, #5 // Filter coeff 5 in every halfword of v24
152 add x8, x0, #8 // x8 = source pointer for the second 8 columns
153 add x14, x1, #8 // x14 = destination pointer for the second 8 columns
154 add x10, x9, #8 // x10 = temp store pointer used by the second (high-half) pass
155 mov x12, x4 // x12 = ht, row counter for the second pass (x4 is consumed by the first)
156 add x11, x7, #8 // x11 = temp load pointer used by the second (high-half) pass
// wd = 16, first 8 columns: prime the vertical filter.
// Each source row is filtered horizontally with the six-tap kernel
//   (s[0] + s[5]) + 20 * (s[2] + s[3]) - 5 * (s[1] + s[4])
// where s[k] is the row shifted by k bytes (ext #k), giving eight unrounded
// 16-bit values per row. Rows -2..2 end up in v6, v8, v10, v12, v14; rows
// -2..1 are also written to the temp buffer here (row 2 is stored at the top
// of loop_16_lowhalf).
157loop_16_lowhalf_start:
158 ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter
159 ext v5.8b, v0.8b , v1.8b , #5 // s[5]
160 uaddl v6.8h, v0.8b, v5.8b // s[0] + s[5]
161
162 ext v2.8b, v0.8b , v1.8b , #2 // s[2]
163 ext v3.8b, v0.8b , v1.8b , #3 // s[3]
164 uaddl v8.8h, v2.8b, v3.8b // s[2] + s[3]
165 ext v4.8b, v0.8b , v1.8b , #4 // s[4]
166 mla v6.8h, v8.8h , v22.8h // += 20 * (s[2] + s[3])
167 ext v1.8b, v0.8b , v1.8b , #1 // s[1] (overwrites v1; done last for that reason)
168 uaddl v8.8h, v1.8b, v4.8b // s[1] + s[4]
169 ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter
170 mls v6.8h, v8.8h , v24.8h // -= 5 * (s[1] + s[4]); v6 = row -2 result
171 ext v5.8b, v0.8b , v1.8b , #5
172 uaddl v8.8h, v0.8b, v5.8b
173 ext v2.8b, v0.8b , v1.8b , #2
174 ext v3.8b, v0.8b , v1.8b , #3
175 uaddl v10.8h, v2.8b, v3.8b
176
177 st1 {v6.4s}, [x9], x6 // store temp buffer 0
178
179 ext v4.8b, v0.8b , v1.8b , #4
180 mla v8.8h, v10.8h , v22.8h
181 ext v1.8b, v0.8b , v1.8b , #1
182 uaddl v10.8h, v1.8b, v4.8b
183 ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter
184 mls v8.8h, v10.8h , v24.8h // v8 = row -1 result
185 ext v5.8b, v0.8b , v1.8b , #5
186 uaddl v10.8h, v0.8b, v5.8b
187 ext v2.8b, v0.8b , v1.8b , #2
188 ext v3.8b, v0.8b , v1.8b , #3
189 uaddl v12.8h, v2.8b, v3.8b
190
191 st1 {v8.4s}, [x9], x6 // store temp buffer 1
192
193 ext v4.8b, v0.8b , v1.8b , #4
194 mla v10.8h, v12.8h , v22.8h
195 ext v1.8b, v0.8b , v1.8b , #1
196 uaddl v12.8h, v1.8b, v4.8b
197 ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter
198 mls v10.8h, v12.8h , v24.8h // v10 = row 0 result
199 ext v5.8b, v0.8b , v1.8b , #5
200 uaddl v12.8h, v0.8b, v5.8b
201 ext v2.8b, v0.8b , v1.8b , #2
202 ext v3.8b, v0.8b , v1.8b , #3
203 uaddl v14.8h, v2.8b, v3.8b
204
205 st1 {v10.4s}, [x9], x6 // store temp buffer 2
206
207 ext v4.8b, v0.8b , v1.8b , #4
208 mla v12.8h, v14.8h , v22.8h
209 ext v1.8b, v0.8b , v1.8b , #1
210 uaddl v14.8h, v1.8b, v4.8b
211 ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter
212 mls v12.8h, v14.8h , v24.8h // v12 = row 1 result
213 ext v5.8b, v0.8b , v1.8b , #5
214 uaddl v14.8h, v0.8b, v5.8b
215 ext v2.8b, v0.8b , v1.8b , #2
216 ext v3.8b, v0.8b , v1.8b , #3
217 uaddl v16.8h, v2.8b, v3.8b
218
219 st1 {v12.4s}, [x9], x6 // store temp buffer 3
220
221 ext v4.8b, v0.8b , v1.8b , #4
222 mla v14.8h, v16.8h , v22.8h
223 ext v1.8b, v0.8b , v1.8b , #1
224 uaddl v16.8h, v1.8b, v4.8b
225
226 mls v14.8h, v16.8h , v24.8h // v14 = row 2 result
// wd = 16, first 8 columns: main loop, four output rows per pass.
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally filtered
// rows r0..r4. Each pass filters four more source rows (into v16, v20, v8,
// v28) and, for every output row, computes in 32-bit lanes
//   (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4)
// which is narrowed with sqrshrun #10 and uqxtn to 8 bits. The temp-buffer
// row read through x7 is narrowed with sqrshrun #5, and urhadd averages the
// two before the 8-byte store through x1.
227loop_16_lowhalf:
228
229 ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter
230 ext v5.8b, v0.8b , v1.8b , #5
231 ext v2.8b, v0.8b , v1.8b , #2
232 ext v3.8b, v0.8b , v1.8b , #3
233 uaddl v16.8h, v0.8b, v5.8b
234
235 st1 {v14.4s}, [x9], x6 // store temp buffer 4
236
237 uaddl v18.8h, v2.8b, v3.8b
238 ext v4.8b, v0.8b , v1.8b , #4
239 mla v16.8h, v18.8h , v22.8h
240 ext v1.8b, v0.8b , v1.8b , #1
241 add v28.8h, v8.8h , v14.8h // r1 + r4 for output row 0
242 uaddl v18.8h, v1.8b, v4.8b
243 add v30.8h, v10.8h , v12.8h // r2 + r3 for output row 0
244 mls v16.8h, v18.8h , v24.8h // v16 = row 3 result
245 ld1 {v0.2s, v1.2s}, [x0], x2 // row 4 load for horizontal filter
246 ext v5.8b, v0.8b , v1.8b , #5
247 ext v2.8b, v0.8b , v1.8b , #2
248 ext v3.8b, v0.8b , v1.8b , #3
249 uaddl v20.8h, v0.8b, v5.8b
250
251 st1 {v16.4s}, [x9], x6 // store temp buffer 5
252
253 saddl v18.4s, v6.4h, v16.4h // r0 + r5, low four columns
254
255 ld1 {v26.4s}, [x7], x6 // load the horizontal half-pel row to average with output row 0
256
257 saddl2 v6.4s, v6.8h, v16.8h // r0 + r5, high four columns
258
259 sqrshrun v26.8b, v26.8h, #5 // round stage-1 row to 8 bits
260
261 smlal v18.4s, v30.4h, v22.4h
262 smlsl v18.4s, v28.4h, v24.4h
263 smlal2 v6.4s, v30.8h, v22.8h
264 smlsl2 v6.4s, v28.8h, v24.8h
265 uaddl v2.8h, v2.8b, v3.8b
266 ext v4.8b, v0.8b , v1.8b , #4
267 mla v20.8h, v2.8h , v22.8h
268 sqrshrun v18.4h, v18.4s, #10
269 ext v1.8b, v0.8b , v1.8b , #1
270 sqrshrun v19.4h, v6.4s, #10
271 add v28.8h, v10.8h , v16.8h // r1 + r4 for output row 1
272 uaddl v2.8h, v1.8b, v4.8b
273 add v30.8h, v12.8h , v14.8h // r2 + r3 for output row 1
274 mls v20.8h, v2.8h , v24.8h // v20 = row 4 result
275
276 uqxtn v18.8b, v18.8h
277 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300278 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530279
280 ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter
281
282 urhadd v26.8b, v18.8b , v26.8b // average the two stages for output row 0
283
284 ext v5.8b, v0.8b , v1.8b , #5
285 ext v2.8b, v0.8b , v1.8b , #2
286
287 st1 {v20.4s}, [x9], x6 // store temp buffer 6
288
289 saddl v18.4s, v8.4h, v20.4h
290
291 saddl2 v6.4s, v8.8h, v20.8h
292
293 ld1 {v8.4s}, [x7], x6 // load the next temp row; v8 is free after the saddl pair above
294
295
296 st1 {v26.2s}, [x1], x3 // store row 0
297
298 smlal v18.4s, v30.4h, v22.4h
299 smlsl v18.4s, v28.4h, v24.4h
300 smlal2 v6.4s, v30.8h, v22.8h
301 smlsl2 v6.4s, v28.8h, v24.8h
302
303 sqrshrun v28.8b, v8.8h, #5
304 ext v3.8b, v0.8b , v1.8b , #3
305 uaddl v8.8h, v0.8b, v5.8b
306 uaddl v2.8h, v2.8b, v3.8b
307 sqrshrun v18.4h, v18.4s, #10
308 ext v4.8b, v0.8b , v1.8b , #4
309 sqrshrun v19.4h, v6.4s, #10
310 mla v8.8h, v2.8h , v22.8h
311 ext v1.8b, v0.8b , v1.8b , #1
312 add v26.8h, v12.8h , v20.8h // r1 + r4 for output row 2
313 uaddl v2.8h, v1.8b, v4.8b
314 uqxtn v18.8b, v18.8h
315 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300316 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530317 add v30.8h, v14.8h , v16.8h
318 mls v8.8h, v2.8h , v24.8h // v8 = row 5 result
319 ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter
320
321 urhadd v28.8b, v28.8b , v18.8b
322
323 ext v5.8b, v0.8b , v1.8b , #5
324 ext v2.8b, v0.8b , v1.8b , #2
325 ext v3.8b, v0.8b , v1.8b , #3
326
327 st1 {v28.2s}, [x1], x3 // store row 1
328
329 uaddl v28.8h, v0.8b, v5.8b
330
331 st1 {v8.4s}, [x9], x6 // store temp buffer 7
332
333 saddl v18.4s, v10.4h, v8.4h
334 saddl2 v6.4s, v10.8h, v8.8h
335
336 ld1 {v10.4s}, [x7], x6 // load the next temp row; v10 is free after the saddl pair above
337
338 smlal v18.4s, v30.4h, v22.4h
339 smlsl v18.4s, v26.4h, v24.4h
340
341 smlal2 v6.4s, v30.8h, v22.8h
342 smlsl2 v6.4s, v26.8h, v24.8h
343
344 sqrshrun v26.8b, v10.8h, #5
345
346 uaddl v2.8h, v2.8b, v3.8b
347 ext v4.8b, v0.8b , v1.8b , #4
348 mla v28.8h, v2.8h , v22.8h
349 sqrshrun v18.4h, v18.4s, #10
350 ext v1.8b, v0.8b , v1.8b , #1
351 sqrshrun v19.4h, v6.4s, #10
352 add v10.8h, v14.8h , v8.8h // r1 + r4 for output row 3
353 uaddl v2.8h, v1.8b, v4.8b
354 add v30.8h, v16.8h , v20.8h // r2 + r3 for output row 3
355 mls v28.8h, v2.8h , v24.8h // v28 = row 6 result
356 uqxtn v27.8b, v18.8h
357 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300358 mov v27.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530359 saddl v18.4s, v12.4h, v28.4h
360 saddl2 v6.4s, v12.8h, v28.8h
361
362 urhadd v26.8b, v26.8b , v27.8b
363
364 smlal v18.4s, v30.4h, v22.4h
365 smlsl v18.4s, v10.4h, v24.4h
366 smlal2 v6.4s, v30.8h, v22.8h
367 smlsl2 v6.4s, v10.8h, v24.8h
368
369 st1 {v26.2s}, [x1], x3 // store row 2
370
// NOTE(review): this writes the low 64 bits of v28 followed by the low 64 bits
// of v29, not the full v28 row; loop_16_highhalf uses {v28.4s} at the same
// point. x9 is not advanced, and the same slot is rewritten from v14 (= v28)
// at the top of the next pass, so the difference appears harmless - confirm.
371 st1 {v28.2s, v29.2s}, [x9]
372
373
374 sqrshrun v18.4h, v18.4s, #10
375
// Rotate the row window for the next pass:
// (v6, v8, v10, v12, v14) <- (v14, v16, v20, v8, v28).
// NOTE(review): the odd-numbered registers copied alongside (v21, v9, v15,
// v17, v29) are not written anywhere in this loop; these moves look like
// leftovers and are presumably removable - confirm.
376 mov v10.16b, v20.16b
377 mov v11.16b, v21.16b
378 ld1 {v30.4s}, [x7], x6 // load the temp row to average with output row 3
379
380 sqrshrun v19.4h, v6.4s, #10
381 subs x4, x4, #4 // four rows done; flags consumed by the bgt below
382
383 sqrshrun v30.8b, v30.8h, #5
384
385 uqxtn v18.8b, v18.8h
386 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300387 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530388
389 mov v12.16b, v8.16b
390 mov v13.16b, v9.16b
391 mov v6.16b, v14.16b
392 mov v7.16b, v15.16b
393
394 urhadd v30.8b, v18.8b , v30.8b
395
396 mov v8.16b, v16.16b
397 mov v9.16b, v17.16b
398 mov v14.16b, v28.16b
399 mov v15.16b, v29.16b
400
401 st1 {v30.2s}, [x1], x3 // store row 3
402
403 bgt loop_16_lowhalf // loop while rows remain (x4 > 0)
404
405
// wd = 16, second 8 columns: prime the vertical filter.
// Same sequence as loop_16_lowhalf_start, but reading the source through x8
// and writing the temp buffer through x10. Rows -2..2 of the horizontal
// six-tap result (s[0]+s[5]) + 20*(s[2]+s[3]) - 5*(s[1]+s[4]) end up in
// v6, v8, v10, v12, v14; the first four are also stored to the temp buffer.
406loop_16_highhalf_start:
407 ld1 {v0.2s, v1.2s}, [x8], x2 // row -2 load for horizontal filter
408 ext v5.8b, v0.8b , v1.8b , #5
409 uaddl v6.8h, v0.8b, v5.8b
410 ext v2.8b, v0.8b , v1.8b , #2
411 ext v3.8b, v0.8b , v1.8b , #3
412 uaddl v8.8h, v2.8b, v3.8b
413 ext v4.8b, v0.8b , v1.8b , #4
414 mla v6.8h, v8.8h , v22.8h
415 ext v1.8b, v0.8b , v1.8b , #1
416 uaddl v8.8h, v1.8b, v4.8b
417 ld1 {v0.2s, v1.2s}, [x8], x2 // row -1 load for horizontal filter
418 mls v6.8h, v8.8h , v24.8h // v6 = row -2 result
419 ext v5.8b, v0.8b , v1.8b , #5
420 uaddl v8.8h, v0.8b, v5.8b
421 ext v2.8b, v0.8b , v1.8b , #2
422 ext v3.8b, v0.8b , v1.8b , #3
423 uaddl v10.8h, v2.8b, v3.8b
424
425 st1 {v6.4s}, [x10], x6 // store temp buffer 0
426
427 ext v4.8b, v0.8b , v1.8b , #4
428 mla v8.8h, v10.8h , v22.8h
429 ext v1.8b, v0.8b , v1.8b , #1
430 uaddl v10.8h, v1.8b, v4.8b
431 ld1 {v0.2s, v1.2s}, [x8], x2 // row 0 load for horizontal filter
432 mls v8.8h, v10.8h , v24.8h // v8 = row -1 result
433 ext v5.8b, v0.8b , v1.8b , #5
434 uaddl v10.8h, v0.8b, v5.8b
435 ext v2.8b, v0.8b , v1.8b , #2
436 ext v3.8b, v0.8b , v1.8b , #3
437 uaddl v12.8h, v2.8b, v3.8b
438
439 st1 {v8.4s}, [x10], x6 // store temp buffer 1
440
441 ext v4.8b, v0.8b , v1.8b , #4
442 mla v10.8h, v12.8h , v22.8h
443 ext v1.8b, v0.8b , v1.8b , #1
444 uaddl v12.8h, v1.8b, v4.8b
445 ld1 {v0.2s, v1.2s}, [x8], x2 // row 1 load for horizontal filter
446 mls v10.8h, v12.8h , v24.8h // v10 = row 0 result
447 ext v5.8b, v0.8b , v1.8b , #5
448 uaddl v12.8h, v0.8b, v5.8b
449 ext v2.8b, v0.8b , v1.8b , #2
450 ext v3.8b, v0.8b , v1.8b , #3
451 uaddl v14.8h, v2.8b, v3.8b
452
453 st1 {v10.4s}, [x10], x6 // store temp buffer 2
454
455 ext v4.8b, v0.8b , v1.8b , #4
456 mla v12.8h, v14.8h , v22.8h
457 ext v1.8b, v0.8b , v1.8b , #1
458 uaddl v14.8h, v1.8b, v4.8b
459 ld1 {v0.2s, v1.2s}, [x8], x2 // row 2 load for horizontal filter
460 mls v12.8h, v14.8h , v24.8h // v12 = row 1 result
461 ext v5.8b, v0.8b , v1.8b , #5
462 uaddl v14.8h, v0.8b, v5.8b
463 ext v2.8b, v0.8b , v1.8b , #2
464 ext v3.8b, v0.8b , v1.8b , #3
465 uaddl v16.8h, v2.8b, v3.8b
466
467 st1 {v12.4s}, [x10], x6 // store temp buffer 3
468
469 ext v4.8b, v0.8b , v1.8b , #4
470 mla v14.8h, v16.8h , v22.8h
471 ext v1.8b, v0.8b , v1.8b , #1
472 uaddl v16.8h, v1.8b, v4.8b
473
474 mls v14.8h, v16.8h , v24.8h // v14 = row 2 result
475
// wd = 16, second 8 columns: main loop, four output rows per pass.
// Mirrors loop_16_lowhalf with x8 (source), x10 (temp store), x11 (temp load),
// x14 (destination) and x12 (row counter). For each output row:
//   vertical = sqrshrun((r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4), #10) -> uqxtn
//   stage-1  = sqrshrun(temp row read through x11, #5)
//   output   = urhadd(vertical, stage-1)
// with r0..r5 taken from the sliding window v6, v8, v10, v12, v14 plus the
// newly filtered rows v16, v20, v8, v28.
476loop_16_highhalf:
477
478 ld1 {v0.2s, v1.2s}, [x8], x2 // row 3 load for horizontal filter
479 ext v5.8b, v0.8b , v1.8b , #5
480 ext v2.8b, v0.8b , v1.8b , #2
481 ext v3.8b, v0.8b , v1.8b , #3
482 uaddl v16.8h, v0.8b, v5.8b
483
484 st1 {v14.4s}, [x10], x6 // store temp buffer 4
485
486 uaddl v18.8h, v2.8b, v3.8b
487 ext v4.8b, v0.8b , v1.8b , #4
488 mla v16.8h, v18.8h , v22.8h
489 ext v1.8b, v0.8b , v1.8b , #1
490 add v28.8h, v8.8h , v14.8h // r1 + r4 for output row 0
491 uaddl v18.8h, v1.8b, v4.8b
492 add v30.8h, v10.8h , v12.8h // r2 + r3 for output row 0
493 mls v16.8h, v18.8h , v24.8h // v16 = row 3 result
494 ld1 {v0.2s, v1.2s}, [x8], x2 // row 4 load for horizontal filter
495 ext v5.8b, v0.8b , v1.8b , #5
496 ext v2.8b, v0.8b , v1.8b , #2
497 ext v3.8b, v0.8b , v1.8b , #3
498 uaddl v20.8h, v0.8b, v5.8b
499
500 st1 {v16.4s}, [x10], x6 // store temp buffer 5
501
502 saddl v18.4s, v6.4h, v16.4h
503
504 ld1 {v26.4s}, [x11], x6 // load the horizontal half-pel row to average with output row 0
505
506 saddl2 v6.4s, v6.8h, v16.8h
507
508 sqrshrun v26.8b, v26.8h, #5
509
510 smlal v18.4s, v30.4h, v22.4h
511 smlsl v18.4s, v28.4h, v24.4h
512 smlal2 v6.4s, v30.8h, v22.8h
513 smlsl2 v6.4s, v28.8h, v24.8h
514 uaddl v2.8h, v2.8b, v3.8b
515 ext v4.8b, v0.8b , v1.8b , #4
516 mla v20.8h, v2.8h , v22.8h
517 sqrshrun v18.4h, v18.4s, #10
518 ext v1.8b, v0.8b , v1.8b , #1
519 sqrshrun v19.4h, v6.4s, #10
520 add v28.8h, v10.8h , v16.8h // r1 + r4 for output row 1
521 uaddl v2.8h, v1.8b, v4.8b
522 add v30.8h, v12.8h , v14.8h // r2 + r3 for output row 1
523 mls v20.8h, v2.8h , v24.8h // v20 = row 4 result
524 uqxtn v18.8b, v18.8h
525 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300526 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530527 ld1 {v0.2s, v1.2s}, [x8], x2 // row 5 load for horizontal filter
528
529 urhadd v26.8b, v18.8b , v26.8b
530
531 ext v5.8b, v0.8b , v1.8b , #5
532 ext v2.8b, v0.8b , v1.8b , #2
533
534 st1 {v20.4s}, [x10], x6 // store temp buffer 6
535
536 saddl v18.4s, v8.4h, v20.4h
537 saddl2 v6.4s, v8.8h, v20.8h
538
539 ld1 {v8.4s}, [x11], x6 // load the next temp row; v8 is free after the saddl pair above
540
541
542 st1 {v26.2s}, [x14], x3 //store row 0
543
544 smlal v18.4s, v30.4h, v22.4h
545 smlsl v18.4s, v28.4h, v24.4h
546 smlal2 v6.4s, v30.8h, v22.8h
547 smlsl2 v6.4s, v28.8h, v24.8h
548 sqrshrun v28.8b, v8.8h, #5
549 ext v3.8b, v0.8b , v1.8b , #3
550 uaddl v8.8h, v0.8b, v5.8b
551 uaddl v2.8h, v2.8b, v3.8b
552 sqrshrun v18.4h, v18.4s, #10
553 ext v4.8b, v0.8b , v1.8b , #4
554 sqrshrun v19.4h, v6.4s, #10
555 mla v8.8h, v2.8h , v22.8h
556 ext v1.8b, v0.8b , v1.8b , #1
557 add v26.8h, v12.8h , v20.8h // r1 + r4 for output row 2
558 uaddl v2.8h, v1.8b, v4.8b
559 uqxtn v18.8b, v18.8h
560 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300561 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530562 add v30.8h, v14.8h , v16.8h
563 mls v8.8h, v2.8h , v24.8h // v8 = row 5 result
564 ld1 {v0.2s, v1.2s}, [x8], x2 // row 6 load for horizontal filter
565
566 urhadd v28.8b, v28.8b , v18.8b
567
568 ext v5.8b, v0.8b , v1.8b , #5
569 ext v2.8b, v0.8b , v1.8b , #2
570 ext v3.8b, v0.8b , v1.8b , #3
571
572 st1 {v28.2s}, [x14], x3 //store row 1
573
574 uaddl v28.8h, v0.8b, v5.8b
575
576 st1 {v8.4s}, [x10], x6 // store temp buffer 7
577
578 saddl v18.4s, v10.4h, v8.4h
579 saddl2 v6.4s, v10.8h, v8.8h
580
581 ld1 {v10.4s}, [x11], x6 // load the next temp row; v10 is free after the saddl pair above
582
583 smlal v18.4s, v30.4h, v22.4h
584 smlsl v18.4s, v26.4h, v24.4h
585 smlal2 v6.4s, v30.8h, v22.8h
586 smlsl2 v6.4s, v26.8h, v24.8h
587
588 sqrshrun v26.8b, v10.8h, #5
589 uaddl v2.8h, v2.8b, v3.8b
590 ext v4.8b, v0.8b , v1.8b , #4
591 mla v28.8h, v2.8h , v22.8h
592 sqrshrun v18.4h, v18.4s, #10
593 ext v1.8b, v0.8b , v1.8b , #1
594 sqrshrun v19.4h, v6.4s, #10
595 add v10.8h, v14.8h , v8.8h // r1 + r4 for output row 3
596 uaddl v2.8h, v1.8b, v4.8b
597 add v30.8h, v16.8h , v20.8h // r2 + r3 for output row 3
598 mls v28.8h, v2.8h , v24.8h // v28 = row 6 result
599 uqxtn v27.8b, v18.8h
600 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300601 mov v27.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530602
603
604 saddl v18.4s, v12.4h, v28.4h
605 saddl2 v6.4s, v12.8h, v28.8h
606
607 urhadd v26.8b, v26.8b , v27.8b
608
609 smlal v18.4s, v30.4h, v22.4h
610 smlsl v18.4s, v10.4h, v24.4h
611 smlal2 v6.4s, v30.8h, v22.8h
612 smlsl2 v6.4s, v10.8h, v24.8h
613
614 st1 {v26.2s}, [x14], x3 // store row 2
615
616 st1 {v28.4s}, [x10] // x10 not advanced: the same slot is stored again from v14 at the top of the next pass
617
618 sqrshrun v18.4h, v18.4s, #10
// Rotate the row window for the next pass:
// (v6, v8, v10, v12, v14) <- (v14, v16, v20, v8, v28).
// NOTE(review): the odd-numbered registers copied alongside (v21, v9, v15,
// v17, v29) are not written anywhere in this loop; these moves look like
// leftovers and are presumably removable - confirm.
619 mov v10.16b, v20.16b
620 mov v11.16b, v21.16b
621 ld1 {v30.4s}, [x11], x6 // load the temp row to average with output row 3
622
623 sqrshrun v19.4h, v6.4s, #10
624 subs x12, x12, #4 // four rows done; flags consumed by the bgt below
625
626 sqrshrun v30.8b, v30.8h, #5
627
628 uqxtn v18.8b, v18.8h
629 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300630 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530631
632 mov v12.16b, v8.16b
633 mov v13.16b, v9.16b
634 mov v6.16b, v14.16b
635 mov v7.16b, v15.16b
636 urhadd v30.8b, v18.8b , v30.8b
637
638 mov v8.16b, v16.16b
639 mov v9.16b, v17.16b
640 mov v14.16b, v28.16b
641 mov v15.16b, v29.16b
642 st1 {v30.2s}, [x14], x3 // store row 3
643
644 bgt loop_16_highhalf // loop while rows remain (x12 > 0)
645 b end_func
646
// wd = 8: load the filter constants and prime the vertical filter.
// Each source row is filtered horizontally with the six-tap kernel
//   (s[0] + s[5]) + 20 * (s[2] + s[3]) - 5 * (s[1] + s[4])
// (s[k] = row shifted by k bytes via ext #k). Rows -2..2 end up in
// v6, v8, v10, v12, v14; rows -2..1 are also written to the temp buffer here
// (row 2 is stored at the top of loop_8).
647loop_8_start:
648
649 movi v22.8h, #0x14 // Filter coeff 20 in every halfword of v22
650 movi v24.8h, #5 // Filter coeff 5 in every halfword of v24
651 ld1 {v0.2s, v1.2s}, [x0], x2 // row -2 load for horizontal filter
652 ext v5.8b, v0.8b , v1.8b , #5
653 uaddl v6.8h, v0.8b, v5.8b
654
655 ext v2.8b, v0.8b , v1.8b , #2
656 ext v3.8b, v0.8b , v1.8b , #3
657 uaddl v8.8h, v2.8b, v3.8b
658 ext v4.8b, v0.8b , v1.8b , #4
659 mla v6.8h, v8.8h , v22.8h
660 ext v1.8b, v0.8b , v1.8b , #1
661 uaddl v8.8h, v1.8b, v4.8b
662 ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load for horizontal filter
663 mls v6.8h, v8.8h , v24.8h // v6 = row -2 result
664 ext v5.8b, v0.8b , v1.8b , #5
665 uaddl v8.8h, v0.8b, v5.8b
666 ext v2.8b, v0.8b , v1.8b , #2
667 ext v3.8b, v0.8b , v1.8b , #3
668 uaddl v10.8h, v2.8b, v3.8b
669
670 st1 {v6.4s}, [x9], x6 // store temp buffer 0
671
672 ext v4.8b, v0.8b , v1.8b , #4
673 mla v8.8h, v10.8h , v22.8h
674 ext v1.8b, v0.8b , v1.8b , #1
675 uaddl v10.8h, v1.8b, v4.8b
676 ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load for horizontal filter
677 mls v8.8h, v10.8h , v24.8h // v8 = row -1 result
678 ext v5.8b, v0.8b , v1.8b , #5
679 uaddl v10.8h, v0.8b, v5.8b
680 ext v2.8b, v0.8b , v1.8b , #2
681 ext v3.8b, v0.8b , v1.8b , #3
682 uaddl v12.8h, v2.8b, v3.8b
683
684 st1 {v8.4s}, [x9], x6 // store temp buffer 1
685
686 ext v4.8b, v0.8b , v1.8b , #4
687 mla v10.8h, v12.8h , v22.8h
688 ext v1.8b, v0.8b , v1.8b , #1
689 uaddl v12.8h, v1.8b, v4.8b
690 ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load for horizontal filter
691 mls v10.8h, v12.8h , v24.8h // v10 = row 0 result
692 ext v5.8b, v0.8b , v1.8b , #5
693 uaddl v12.8h, v0.8b, v5.8b
694 ext v2.8b, v0.8b , v1.8b , #2
695 ext v3.8b, v0.8b , v1.8b , #3
696 uaddl v14.8h, v2.8b, v3.8b
697
698 st1 {v10.4s}, [x9], x6 // store temp buffer 2
699
700 ext v4.8b, v0.8b , v1.8b , #4
701 mla v12.8h, v14.8h , v22.8h
702 ext v1.8b, v0.8b , v1.8b , #1
703 uaddl v14.8h, v1.8b, v4.8b
704 ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load for horizontal filter
705 mls v12.8h, v14.8h , v24.8h // v12 = row 1 result
706 ext v5.8b, v0.8b , v1.8b , #5
707 uaddl v14.8h, v0.8b, v5.8b
708 ext v2.8b, v0.8b , v1.8b , #2
709 ext v3.8b, v0.8b , v1.8b , #3
710 uaddl v16.8h, v2.8b, v3.8b
711
712 st1 {v12.4s}, [x9], x6 // store temp buffer 3
713
714 ext v4.8b, v0.8b , v1.8b , #4
715 mla v14.8h, v16.8h , v22.8h
716 ext v1.8b, v0.8b , v1.8b , #1
717 uaddl v16.8h, v1.8b, v4.8b
718
719 mls v14.8h, v16.8h , v24.8h // v14 = row 2 result
// wd = 8: main loop, four output rows per pass.
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally filtered
// rows r0..r4. Each pass filters four more source rows (into v16, v20, v8,
// v28) and, for every output row, computes in 32-bit lanes
//   (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4)
// which is narrowed with sqrshrun #10 and uqxtn to 8 bits. The temp-buffer
// row read through x7 is narrowed with sqrshrun #5, and urhadd averages the
// two before the 8-byte store through x1.
720loop_8:
721
722 ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load for horizontal filter
723 ext v5.8b, v0.8b , v1.8b , #5
724 ext v2.8b, v0.8b , v1.8b , #2
725 ext v3.8b, v0.8b , v1.8b , #3
726 uaddl v16.8h, v0.8b, v5.8b
727
728 st1 {v14.4s}, [x9], x6 // store temp buffer 4
729
730 uaddl v18.8h, v2.8b, v3.8b
731 ext v4.8b, v0.8b , v1.8b , #4
732 mla v16.8h, v18.8h , v22.8h
733 ext v1.8b, v0.8b , v1.8b , #1
734 add v28.8h, v8.8h , v14.8h // r1 + r4 for output row 0
735 uaddl v18.8h, v1.8b, v4.8b
736 add v30.8h, v10.8h , v12.8h // r2 + r3 for output row 0
737 mls v16.8h, v18.8h , v24.8h // v16 = row 3 result
738 ld1 {v0.2s, v1.2s} , [x0], x2 // row 4 load for horizontal filter
739 ext v5.8b, v0.8b , v1.8b , #5
740 ext v2.8b, v0.8b , v1.8b , #2
741 ext v3.8b, v0.8b , v1.8b , #3
742 uaddl v20.8h, v0.8b, v5.8b
743
744 st1 {v16.4s}, [x9], x6 // store temp buffer 5
745
746 saddl v18.4s, v6.4h, v16.4h // r0 + r5, low four columns
747
748 ld1 {v26.4s}, [x7], x6 // load the horizontal half-pel row to average with output row 0
749
750 saddl2 v6.4s, v6.8h, v16.8h // r0 + r5, high four columns
751
752 sqrshrun v26.8b, v26.8h, #5 // round stage-1 row to 8 bits
753
754 smlal v18.4s, v30.4h, v22.4h
755 smlsl v18.4s, v28.4h, v24.4h
756 smlal2 v6.4s, v30.8h, v22.8h
757 smlsl2 v6.4s, v28.8h, v24.8h
758 uaddl v2.8h, v2.8b, v3.8b
759 ext v4.8b, v0.8b , v1.8b , #4
760 mla v20.8h, v2.8h , v22.8h
761 sqrshrun v18.4h, v18.4s, #10
762 ext v1.8b, v0.8b , v1.8b , #1
763 sqrshrun v19.4h, v6.4s, #10
764 add v28.8h, v10.8h , v16.8h // r1 + r4 for output row 1
765 uaddl v2.8h, v1.8b, v4.8b
766 add v30.8h, v12.8h , v14.8h // r2 + r3 for output row 1
767 mls v20.8h, v2.8h , v24.8h // v20 = row 4 result
768
769 uqxtn v18.8b, v18.8h
770 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300771 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530772
773 ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter
774
775 urhadd v26.8b, v18.8b , v26.8b // average the two stages for output row 0
776
777 ext v5.8b, v0.8b , v1.8b , #5
778 ext v2.8b, v0.8b , v1.8b , #2
779
780 st1 {v20.4s}, [x9], x6 // store temp buffer 6
781
782 saddl v18.4s, v8.4h, v20.4h
783
784 saddl2 v6.4s, v8.8h, v20.8h
785
786 ld1 {v8.4s}, [x7], x6 // load the next temp row; v8 is free after the saddl pair above
787
788
789 st1 {v26.2s}, [x1], x3 // store row 0
790
791 smlal v18.4s, v30.4h, v22.4h
792 smlsl v18.4s, v28.4h, v24.4h
793
794
795
796 smlal2 v6.4s, v30.8h, v22.8h
797 smlsl2 v6.4s, v28.8h, v24.8h
798
799 sqrshrun v28.8b, v8.8h, #5
800
801 ext v3.8b, v0.8b , v1.8b , #3
802 uaddl v8.8h, v0.8b, v5.8b
803 uaddl v2.8h, v2.8b, v3.8b
804 sqrshrun v18.4h, v18.4s, #10
805 ext v4.8b, v0.8b , v1.8b , #4
806 sqrshrun v19.4h, v6.4s, #10
807 mla v8.8h, v2.8h , v22.8h
808 ext v1.8b, v0.8b , v1.8b , #1
809 add v26.8h, v12.8h , v20.8h // r1 + r4 for output row 2
810 uaddl v2.8h, v1.8b, v4.8b
811
812
813 uqxtn v18.8b, v18.8h
814 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300815 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530816
817 add v30.8h, v14.8h , v16.8h
818 mls v8.8h, v2.8h , v24.8h // v8 = row 5 result
819 ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter
820
821 urhadd v28.8b, v28.8b , v18.8b
822
823 ext v5.8b, v0.8b , v1.8b , #5
824 ext v2.8b, v0.8b , v1.8b , #2
825 ext v3.8b, v0.8b , v1.8b , #3
826
827 st1 {v28.2s}, [x1], x3 // store row 1
828
829 uaddl v28.8h, v0.8b, v5.8b
830
831 st1 {v8.4s}, [x9], x6 // store temp buffer 7
832
833 saddl v18.4s, v10.4h, v8.4h
834 saddl2 v6.4s, v10.8h, v8.8h
835
836 ld1 {v10.4s}, [x7], x6 // load the next temp row; v10 is free after the saddl pair above
837
838 smlal v18.4s, v30.4h, v22.4h
839 smlsl v18.4s, v26.4h, v24.4h
840 smlal2 v6.4s, v30.8h, v22.8h
841 smlsl2 v6.4s, v26.8h, v24.8h
842
843 sqrshrun v26.8b, v10.8h, #5
844 uaddl v2.8h, v2.8b, v3.8b
845 ext v4.8b, v0.8b , v1.8b , #4
846 mla v28.8h, v2.8h , v22.8h
847 sqrshrun v18.4h, v18.4s, #10
848 ext v1.8b, v0.8b , v1.8b , #1
849 sqrshrun v19.4h, v6.4s, #10
850 add v10.8h, v14.8h , v8.8h // r1 + r4 for output row 3
851 uaddl v2.8h, v1.8b, v4.8b
852 add v30.8h, v16.8h , v20.8h // r2 + r3 for output row 3
853 mls v28.8h, v2.8h , v24.8h // v28 = row 6 result
854
855 uqxtn v27.8b, v18.8h
856 uqxtn v19.8b, v19.8h
857
Martin Storsjodb02f572015-06-10 12:05:14 +0300858 mov v27.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530859
860 saddl v18.4s, v12.4h, v28.4h
861 saddl2 v6.4s, v12.8h, v28.8h
862
863 urhadd v26.8b, v26.8b , v27.8b
864
865 smlal v18.4s, v30.4h, v22.4h
866 smlsl v18.4s, v10.4h, v24.4h
867 smlal2 v6.4s, v30.8h, v22.8h
868 smlsl2 v6.4s, v10.8h, v24.8h
869
870 st1 {v26.2s}, [x1], x3 // store row 2
871
// NOTE(review): this writes the low 64 bits of v28 followed by the low 64 bits
// of v29, not the full v28 row; loop_16_highhalf uses {v28.4s} at the same
// point. x9 is not advanced, and the same slot is rewritten from v14 (= v28)
// at the top of the next pass, so the difference appears harmless - confirm.
872 st1 {v28.2s, v29.2s}, [x9]
873
874
875 sqrshrun v18.4h, v18.4s, #10
// Rotate the row window for the next pass:
// (v6, v8, v10, v12, v14) <- (v14, v16, v20, v8, v28).
// NOTE(review): the odd-numbered registers copied alongside (v21, v9, v15,
// v17, v29) are not written anywhere in this loop; these moves look like
// leftovers and are presumably removable - confirm.
876 mov v10.16b, v20.16b
877 mov v11.16b, v21.16b
878 ld1 {v30.4s}, [x7], x6 // load the temp row to average with output row 3
879
880 sqrshrun v19.4h, v6.4s, #10
881 subs x4, x4, #4 // four rows done; flags consumed by the bgt below
882
883 sqrshrun v30.8b, v30.8h, #5
884
885
886 uqxtn v18.8b, v18.8h
887 uqxtn v19.8b, v19.8h
Martin Storsjodb02f572015-06-10 12:05:14 +0300888 mov v18.s[1], v19.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +0530889
890
891 mov v12.16b, v8.16b
892 mov v13.16b, v9.16b
893 mov v6.16b, v14.16b
894 mov v7.16b, v15.16b
895
896 urhadd v30.8b, v18.8b , v30.8b
897 mov v8.16b, v16.16b
898 mov v9.16b, v17.16b
899 mov v14.16b, v28.16b
900 mov v15.16b, v29.16b
901 st1 {v30.2s}, [x1], x3 // store row 3
902
903 bgt loop_8 // loop while rows remain (x4 > 0)
904 b end_func
905
// wd = 4: load the filter constants (v22 = 20, v23 = 5) and prime the
// vertical filter. The widening adds run on eight lanes, but the multiply
// accumulate/subtract steps and the temp-buffer stores use only the low four
// halfwords (.4h / .2s), i.e. four columns per row. The horizontal kernel is
//   (s[0] + s[5]) + 20 * (s[2] + s[3]) - 5 * (s[1] + s[4])
// Rows -2..2 end up in v6, v8, v10, v12, v14; rows -2..1 are also written to
// the temp buffer here (row 2 is stored at the top of loop_4).
906loop_4_start:
907 movi v22.8h, #20 // Filter coeff 20 in every halfword of v22
908 movi v23.8h, #5 // Filter coeff 5 in every halfword of v23
909
910 ld1 {v0.2s, v1.2s}, [x0], x2 //row -2 load
911 ext v5.8b, v0.8b , v1.8b , #5
912 uaddl v6.8h, v0.8b, v5.8b
913 ext v2.8b, v0.8b , v1.8b , #2
914 ext v3.8b, v0.8b , v1.8b , #3
915 uaddl v8.8h, v2.8b, v3.8b
916 ext v4.8b, v0.8b , v1.8b , #4
917 mla v6.4h, v8.4h , v22.4h
918 ext v1.8b, v0.8b , v1.8b , #1
919 uaddl v8.8h, v1.8b, v4.8b
920 ld1 {v0.2s, v1.2s}, [x0], x2 // row -1 load
921 mls v6.4h, v8.4h , v23.4h // v6 = row -2 result
922 ext v5.8b, v0.8b , v1.8b , #5
923 uaddl v8.8h, v0.8b, v5.8b
924 ext v2.8b, v0.8b , v1.8b , #2
925 ext v3.8b, v0.8b , v1.8b , #3
926 uaddl v10.8h, v2.8b, v3.8b
927
928 st1 {v6.2s}, [x9], x6 // store temp buffer 0
929
930 ext v4.8b, v0.8b , v1.8b , #4
931 mla v8.4h, v10.4h , v22.4h
932 ext v1.8b, v0.8b , v1.8b , #1
933 uaddl v10.8h, v1.8b, v4.8b
934 ld1 {v0.2s, v1.2s}, [x0], x2 // row 0 load
935 mls v8.4h, v10.4h , v23.4h // v8 = row -1 result
936 ext v5.8b, v0.8b , v1.8b , #5
937 uaddl v10.8h, v0.8b, v5.8b
938 ext v2.8b, v0.8b , v1.8b , #2
939 ext v3.8b, v0.8b , v1.8b , #3
940 uaddl v12.8h, v2.8b, v3.8b
941
942 st1 {v8.2s}, [x9], x6 // store temp buffer 1
943
944 ext v4.8b, v0.8b , v1.8b , #4
945 mla v10.4h, v12.4h , v22.4h
946 ext v1.8b, v0.8b , v1.8b , #1
947 uaddl v12.8h, v1.8b, v4.8b
948 ld1 {v0.2s, v1.2s}, [x0], x2 // row 1 load
949 mls v10.4h, v12.4h , v23.4h // v10 = row 0 result
950 ext v5.8b, v0.8b , v1.8b , #5
951 uaddl v12.8h, v0.8b, v5.8b
952 ext v2.8b, v0.8b , v1.8b , #2
953 ext v3.8b, v0.8b , v1.8b , #3
954 uaddl v14.8h, v2.8b, v3.8b
955
956 st1 {v10.2s}, [x9], x6 // store temp buffer 2
957
958 ext v4.8b, v0.8b , v1.8b , #4
959 mla v12.4h, v14.4h , v22.4h
960 ext v1.8b, v0.8b , v1.8b , #1
961 uaddl v14.8h, v1.8b, v4.8b
962 ld1 {v0.2s, v1.2s}, [x0], x2 // row 2 load
963 mls v12.4h, v14.4h , v23.4h // v12 = row 1 result
964 ext v5.8b, v0.8b , v1.8b , #5
965 uaddl v14.8h, v0.8b, v5.8b
966 ext v2.8b, v0.8b , v1.8b , #2
967 ext v3.8b, v0.8b , v1.8b , #3
968 uaddl v16.8h, v2.8b, v3.8b
969 ext v4.8b, v0.8b , v1.8b , #4
970 mla v14.4h, v16.4h , v22.4h
971 ext v1.8b, v0.8b , v1.8b , #1
972 uaddl v16.8h, v1.8b, v4.8b
973
974 st1 {v12.2s}, [x9], x6 // store temp buffer 3
975
976 mls v14.4h, v16.4h , v23.4h // v14 = row 2 result
977
// wd = 4: main loop, four output rows (4 bytes each) per pass.
// On entry v6, v8, v10, v12, v14 hold five consecutive horizontally filtered
// rows r0..r4 (four halfwords each). Each pass filters four more source rows
// (into v16, v26, v28, v30) and, for every output row, computes
//   (r0 + r5) + 20 * (r2 + r3) - 5 * (r1 + r4)
// in 32-bit lanes of v0, narrowed with sqrshrun #0xa and uqxtn. Two narrowed
// rows are packed into one 64-bit register with trn1, averaged (urhadd) with
// two temp-buffer rows narrowed by sqrshrun #5, and stored as two 32-bit lanes.
978loop_4:
979
980 ld1 {v0.2s, v1.2s}, [x0], x2 // row 3 load
981 ext v5.8b, v0.8b , v1.8b , #5
982 uaddl v16.8h, v0.8b, v5.8b
983 ext v2.8b, v0.8b , v1.8b , #2
984 ext v3.8b, v0.8b , v1.8b , #3
985 uaddl v18.8h, v2.8b, v3.8b
986 st1 {v14.2s}, [x9], x6 // store temp buffer 4
987 ext v4.8b, v0.8b , v1.8b , #4
988 mla v16.4h, v18.4h , v22.4h
989 ext v1.8b, v0.8b , v1.8b , #1
990 uaddl v18.8h, v1.8b, v4.8b
991 add v2.4h, v10.4h , v12.4h // r2 + r3 for output row 0
992 mls v16.4h, v18.4h , v23.4h // v16 = row 3 result
993 add v3.4h, v8.4h , v14.4h // r1 + r4 for output row 0
994 ld1 {v18.2s, v19.2s}, [x0], x2 // row 4 load
995 ext v25.8b, v18.8b , v19.8b , #5
996 uaddl v26.8h, v18.8b, v25.8b
997 ext v20.8b, v18.8b , v19.8b , #2
998
999 st1 {v16.2s}, [x9], x6 // store temp buffer 5
1000
1001 saddl v0.4s, v6.4h, v16.4h // r0 + r5 for output row 0
1002 smlal v0.4s, v2.4h, v22.4h
1003 ext v21.8b, v18.8b , v19.8b , #3
1004 uaddl v28.8h, v20.8b, v21.8b
1005 ext v24.8b, v18.8b , v19.8b , #4
1006 smlsl v0.4s, v3.4h, v23.4h
1007 mla v26.4h, v28.4h , v22.4h
1008 ext v19.8b, v18.8b , v19.8b , #1
1009 uaddl v28.8h, v19.8b, v24.8b
1010 add v2.4h, v12.4h , v14.4h // r2 + r3 for output row 1
1011 mls v26.4h, v28.4h , v23.4h // v26 = row 4 result
1012 sqrshrun v0.4h, v0.4s, #0xa
1013 add v3.4h, v10.4h , v16.4h // r1 + r4 for output row 1
1014 ld1 {v18.2s, v19.2s}, [x0], x2 // row 5 load
1015 ext v25.8b, v18.8b , v19.8b , #5
1016 uqxtn v11.8b, v0.8h // v11 = vertical result for output row 0
1017 uaddl v28.8h, v18.8b, v25.8b
1018
1019 st1 {v26.2s}, [x9], x6 // store temp buffer 6
1020
1021 // v6 (old r0) is no longer needed here, so v6/v7 are reused for the temp rows
1022 ld1 {v6.2s}, [x7], x6 // load the temp row to average with output row 0
1023 ld1 {v7.2s}, [x7], x6 // load the temp row to average with output row 1
1024
1025 sqrshrun v9.8b, v6.8h, #5
1026 sqrshrun v7.8b, v7.8h, #5
Martin Storsjodb02f572015-06-10 12:05:14 +03001027 mov v9.s[1], v7.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301028
1029 ext v20.8b, v18.8b , v19.8b , #2
1030
1031 saddl v0.4s, v8.4h, v26.4h // r0 + r5 for output row 1
1032 smlal v0.4s, v2.4h, v22.4h
1033 ext v21.8b, v18.8b , v19.8b , #3
1034 uaddl v6.8h, v20.8b, v21.8b
1035 ext v24.8b, v18.8b , v19.8b , #4
1036 smlsl v0.4s, v3.4h, v23.4h
1037 mla v28.4h, v6.4h , v22.4h
1038 ext v19.8b, v18.8b , v19.8b , #1
1039 uaddl v6.8h, v19.8b, v24.8b
1040 add v2.4h, v14.4h , v16.4h // r2 + r3 for output row 2
1041 mls v28.4h, v6.4h , v23.4h // v28 = row 5 result
1042 sqrshrun v0.4h, v0.4s, #0xa
1043 add v3.4h, v12.4h , v26.4h // r1 + r4 for output row 2
1044 ld1 {v18.2s, v19.2s}, [x0], x2 // row 6 load
1045 ext v25.8b, v18.8b , v19.8b , #5
1046 uqxtn v13.8b, v0.8h // v13 = vertical result for output row 1
1047
1048 trn1 v11.2s, v11.2s, v13.2s // pack output rows 0 and 1 into the two 32-bit lanes of v11
1049 trn2 v13.2s, v11.2s, v13.2s // v13 is not read again before it is rewritten
1050 saddl v0.4s, v10.4h, v28.4h // r0 + r5 for output row 2
1051 urhadd v9.8b, v9.8b , v11.8b // average the two stages for output rows 0 and 1
1052
1053 st1 {v28.2s}, [x9], x6 // store temp buffer 7
1054
1055 smlal v0.4s, v2.4h, v22.4h
1056 uaddl v30.8h, v18.8b, v25.8b
1057
1058 st1 {v9.s}[0], [x1], x3 // store row 0
1059
1060 ext v20.8b, v18.8b , v19.8b , #2
1061
1062 st1 {v9.s}[1], [x1], x3 // store row 1
1063
1064 ext v21.8b, v18.8b , v19.8b , #3
1065 smlsl v0.4s, v3.4h, v23.4h
1066 uaddl v8.8h, v20.8b, v21.8b
1067 ext v24.8b, v18.8b , v19.8b , #4
1068 mla v30.4h, v8.4h , v22.4h
1069 ext v19.8b, v18.8b , v19.8b , #1
1070 uaddl v8.8h, v19.8b, v24.8b
1071 sqrshrun v0.4h, v0.4s, #0xa
1072 add v2.4h, v16.4h , v26.4h // r2 + r3 for output row 3
1073 mls v30.4h, v8.4h , v23.4h // v30 = row 6 result
1074 uqxtn v4.8b, v0.8h // v4 = vertical result for output row 2
1075
1076 add v3.4h, v14.4h , v28.4h // r1 + r4 for output row 3
1077
1078
1079 saddl v0.4s, v12.4h, v30.4h // r0 + r5 for output row 3
1080
1081 st1 {v30.2s}, [x9] // x9 not advanced: the same slot is stored again from v14 at the top of the next pass
1082
1083 smlal v0.4s, v2.4h, v22.4h
1084
1085 ld1 {v8.2s}, [x7], x6 // load the temp row to average with output row 2
1086 ld1 {v9.2s}, [x7], x6 // load the temp row to average with output row 3
1087 smlsl v0.4s, v3.4h, v23.4h
1088 subs x4, x4, #4 // four rows done; flags consumed by the bgt below
1089
1090 sqrshrun v10.8b, v8.8h, #5
1091 sqrshrun v9.8b, v9.8h, #5
Martin Storsjodb02f572015-06-10 12:05:14 +03001092 mov v10.s[1], v9.s[0]
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301093
// Rotate the row window for the next pass (interleaved with the last output):
// (v6, v8, v10, v12, v14) <- (v14, v16, v26, v28, v30).
1094 mov v12.8b, v28.8b
1095
1096 sqrshrun v0.4h, v0.4s, #0xa
1097 mov v6.8b, v14.8b
1098 mov v8.8b, v16.8b
1099
1100 uqxtn v5.8b, v0.8h // v5 = vertical result for output row 3
1101
1102 trn1 v4.2s, v4.2s, v5.2s // pack output rows 2 and 3 into the two 32-bit lanes of v4
1103 trn2 v5.2s, v4.2s, v5.2s // v5 is not read again before it is rewritten
1104 urhadd v4.8b, v4.8b , v10.8b // average the two stages for output rows 2 and 3
1105 mov v10.8b, v26.8b
1106 mov v14.8b, v30.8b
1107
1108 st1 {v4.s}[0], [x1], x3 // store row 2
1109 st1 {v4.s}[1], [x1], x3 // store row 3
1110
1111 bgt loop_4 // loop while rows remain (x4 > 0)
1112
// Common exit for all three width paths: undo the entry saves in reverse
// order (x19/x20 first, then the vector registers) and return.
1113end_func:
Harish Mahendrakar25e8adb2015-04-20 15:33:05 +05301114 //Restoring registers from stack
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301115 ldp x19, x20, [sp], #16
1116 pop_v_regs // counterpart of push_v_regs at entry (macro from ih264_neon_macros.s)
1117 ret
1118
1119
1120