blob: 6f4074746a4d68f5f7bba436004bc15a47463dbf [file] [log] [blame]
Harish Mahendrakar0d8951c2014-05-16 10:31:13 -07001///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * @file
21// * ihevc_itrans_recon_8x8_neon.s
22// *
23// * @brief
24// * contains function definitions for single stage inverse transform
25// *
26// * @author
27// * anand s
28// *
29// * @par list of functions:
30// * - ihevc_itrans_recon_32x32()
31// *
32// * @remarks
33// * the input buffer is being corrupted
34// *
35// *******************************************************************************
36//*/
37
38///**
39// *******************************************************************************
40// *
41// * @brief
42// * this function performs inverse transform and reconstruction for 32x32
43// * input block
44// *
45// * @par description:
46// * performs inverse transform and adds the prediction data and clips output
47// * to 8 bit
48// *
49// * @param[in] pi2_src
50// * input 32x32 coefficients
51// *
52// * @param[in] pi2_tmp
53// * temporary 32x32 buffer for storing inverse transform
54// * 1st stage output
57// *
58// * @param[in] pu1_pred
59// * prediction 32x32 block
60// *
61// * @param[out] pu1_dst
62// * output 32x32 block
63// *
64// * @param[in] src_strd
65// * input stride
66// *
67// * @param[in] pred_strd
68// * prediction stride
69// *
70// * @param[in] dst_strd
71// * output stride
72// *
73// * @param[in] shift
74// * output shift (no such argument appears in the prototype below; the stage shifts are .set constants)
75// *
76// * @param[in] x12
77// * zero columns in pi2_src
78// *
79// * @returns void
80// *
81// * @remarks
82// * none
83// *
84// *******************************************************************************
85// */
86
87//void ihevc_itrans_recon_32x32(word16 *pi2_src,
88// word16 *pi2_tmp,
89// uword8 *pu1_pred,
90// uword8 *pu1_dst,
91// word32 src_strd,
92// word32 pred_strd,
93// word32 dst_strd,
94// word32 x12,
95// word32 x11 )
96
97//**************variables vs registers*************************
98// x0 => *pi2_src
99// x1 => *pi2_tmp
100// x2 => *pu1_pred
101// x3 => *pu1_dst
102// x4 => src_strd
103// x5 => pred_strd
104// x6 => dst_strd
105// x7 => zero_cols (copied to x12)
106// [sp] => zero_rows (loaded into w11)
107
108
109//d0[0]= 64 d2[0]=83
110//d0[1]= 90 d2[1]=82
111//d0[2]= 90 d2[2]=80
112//d0[3]= 90 d2[3]=78
113//d1[0]= 89 d3[0]=75
114//d1[1]= 88 d3[1]=73
115//d1[2]= 87 d3[2]=70
116//d1[3]= 85 d3[3]=67
117
118//d4[0]= 64 d6[0]=36
119//d4[1]= 61 d6[1]=31
120//d4[2]= 57 d6[2]=25
121//d4[3]= 54 d6[3]=22
122//d5[0]= 50 d7[0]=18
123//d5[1]= 46 d7[1]=13
124//d5[2]= 43 d7[2]=9
125//d5[3]= 38 d7[3]=4
126
127.text // code section; the two literal words below are also placed here
128.align 4 // power-of-two alignment on AArch64 GNU as: 2^4 = 16 bytes
129.include "ihevc_neon_macros.s" // presumably supplies the push_v_regs macro used in the prologue -- TODO confirm
130
131
132
133
134.set shift_stage1_idct , 7 // right shift used by the sqrshrn narrowing at the end of each first-stage pass
135.set shift_stage2_idct , 12 // second-stage counterpart; presumably used later in the file -- TODO confirm
136
137//#define zero_cols x12
138//#define zero_rows x11
139
140.globl ihevc_itrans_recon_32x32_av8 // exported entry point
141
142.extern g_ai2_ihevc_trans_32_transpose // coefficient table; its first 32 halfwords are loaded into v0-v7 at entry
143
144x5_addr: .word 0xfffff000 // loaded into w5; threshold compared against zero_cols (x12) and zero_rows (x11)
145x9_addr: .word 0xffff0000 // loaded into w7 (not x9, despite the label); threshold compared against x12 and x11
146
147.type ihevc_itrans_recon_32x32_av8, %function // mark the symbol as a function in the symbol table
148
149ihevc_itrans_recon_32x32_av8:
150
151 ldr w11, [sp]
152
153// stmfd sp!,{x0-x12,x14}
154 push_v_regs
155 stp x19, x20,[sp,#-16]!
156 stp x0, x1,[sp,#-16]!
157 stp x5, x6,[sp,#-16]!
158
159//ldr x8,[sp,#56] @ prediction stride
160//ldr x7,[sp,#64] @ destination stride
161 mov x6, x4 // src stride
162 mov x12, x7
163 lsl x6, x6, #1 // x sizeof(word16)
164 add x10,x6,x6, lsl #1 // 3 rows
165
166
167 mov x8,x0
168
169 adrp x14, :got:g_ai2_ihevc_trans_32_transpose
170 ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose]
171
172 ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
173 ld1 {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
174
175//registers which are free
176// x10,x9,x11,x12
177 mov x9,#0xffffff00
178 mov x10,#0xfffffff0
179 ldr w5, x5_addr
180 ldr w7, x9_addr
181 cmp x12,x10
182 mov x20,#1
183 csel x14, x20, x14,hs
184 bhs stage1
185
186
187 cmp x12,x9
188 mov x20,#2
189 csel x14, x20, x14,hs
190 bhs stage1
191
192 cmp x12,x5
193 mov x20,#3
194 csel x14, x20, x14,hs
195 bhs stage1
196
197 cmp x12,x7
198 mov x20,#4
199 csel x14, x20, x14,hs
200
201 mov x14,#8
202 b stage1
203//.ltorg
204
205
206dct_stage1:
207 add x8,x8,#8
208 mov x0,x8
209
210stage1:
211 ld1 {v10.4h},[x0],x6
212 ld1 {v8.4h},[x0],x6
213 ld1 {v11.4h},[x0],x6
214 ld1 {v9.4h},[x0],x6
215
216 smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
217 smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
218 smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
219 smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
220
221 smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
222 smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
223 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
224 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
225
226
227
228
229
230 smull v20.4s, v10.4h, v0.4h[0]
231 smlal v20.4s, v11.4h, v0.4h[2]
232
233
234 smull v22.4s, v10.4h, v0.4h[0]
235 smlal v22.4s, v11.4h, v1.4h[2]
236
237 smull v16.4s, v10.4h, v0.4h[0]
238 smlal v16.4s, v11.4h, v2.4h[2]
239
240 smull v18.4s, v10.4h, v0.4h[0]
241 smlal v18.4s, v11.4h, v3.4h[2]
242 cmp x11,x10
243 bhs shift1
244
245 ld1 {v12.4h},[x0],x6
246 ld1 {v14.4h},[x0],x6
247 ld1 {v13.4h},[x0],x6
248 ld1 {v15.4h},[x0],x6
249
250
251
252
253
254
255
256 smlal v24.4s, v14.4h, v1.4h[1]
257 smlal v26.4s, v14.4h, v3.4h[3]
258 smlal v28.4s, v14.4h, v6.4h[1]
259 smlsl v30.4s, v14.4h, v7.4h[1]
260
261
262 smlal v24.4s, v15.4h, v1.4h[3]
263 smlal v26.4s, v15.4h, v5.4h[1]
264 smlsl v28.4s, v15.4h, v7.4h[1]
265 smlsl v30.4s, v15.4h, v3.4h[3]
266
267
268 smlal v20.4s, v12.4h, v1.4h[0]
269 smlal v20.4s, v13.4h, v1.4h[2]
270 smlal v22.4s, v12.4h, v3.4h[0]
271 smlal v22.4s, v13.4h, v4.4h[2]
272 smlal v16.4s, v12.4h, v5.4h[0]
273 smlal v16.4s, v13.4h, v7.4h[2]
274 smlal v18.4s, v12.4h, v7.4h[0]
275 smlsl v18.4s, v13.4h, v5.4h[2]
276
277 cmp x11,x9
278 bhs shift1
279
280 ld1 {v10.4h},[x0],x6
281 ld1 {v8.4h},[x0],x6
282 ld1 {v11.4h},[x0],x6
283 ld1 {v9.4h},[x0],x6
284
285
286 smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
287 smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
288 smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
289 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
290
291 smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
292 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
293 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
294 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
295
296
297
298
299
300 smlal v20.4s, v10.4h, v2.4h[0]
301 smlal v20.4s, v11.4h, v2.4h[2]
302
303
304 smlal v22.4s, v10.4h, v6.4h[0]
305 smlal v22.4s, v11.4h, v7.4h[2]
306
307 smlsl v16.4s, v10.4h, v6.4h[0]
308 smlsl v16.4s, v11.4h, v3.4h[2]
309
310 smlsl v18.4s, v10.4h, v2.4h[0]
311 smlsl v18.4s, v11.4h, v1.4h[2]
312
313 cmp x11,x5
314 bhs shift1
315
316
317 ld1 {v12.4h},[x0],x6
318 ld1 {v14.4h},[x0],x6
319 ld1 {v13.4h},[x0],x6
320 ld1 {v15.4h},[x0],x6
321
322
323
324
325
326
327
328
329
330 smlal v24.4s, v14.4h, v3.4h[1]
331 smlsl v26.4s, v14.4h, v6.4h[1]
332 smlsl v28.4s, v14.4h, v0.4h[1]
333 smlsl v30.4s, v14.4h, v6.4h[3]
334
335
336 smlal v24.4s, v15.4h, v3.4h[3]
337 smlsl v26.4s, v15.4h, v4.4h[3]
338 smlsl v28.4s, v15.4h, v2.4h[3]
339 smlal v30.4s, v15.4h, v5.4h[3]
340
341
342 smlal v20.4s, v12.4h, v3.4h[0]
343 smlal v20.4s, v13.4h, v3.4h[2]
344 smlsl v22.4s, v12.4h, v7.4h[0]
345 smlsl v22.4s, v13.4h, v5.4h[2]
346 smlsl v16.4s, v12.4h, v1.4h[0]
347 smlsl v16.4s, v13.4h, v1.4h[2]
348 smlsl v18.4s, v12.4h, v5.4h[0]
349 smlal v18.4s, v13.4h, v7.4h[2]
350
351 cmp x11,x7
352 bhs shift1
353
354
355 ld1 {v10.4h},[x0],x6
356 ld1 {v8.4h},[x0],x6
357 ld1 {v11.4h},[x0],x6
358 ld1 {v9.4h},[x0],x6
359
360
361
362 smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
363 smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
364 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
365 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
366
367 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
368 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
369 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
370 smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
371
372
373
374
375
376 smlal v20.4s, v10.4h, v0.4h[0]
377 smlal v20.4s, v11.4h, v4.4h[2]
378
379
380 smlsl v22.4s, v10.4h, v0.4h[0]
381 smlsl v22.4s, v11.4h, v2.4h[2]
382
383 smlsl v16.4s, v10.4h, v0.4h[0]
384 smlsl v16.4s, v11.4h, v6.4h[2]
385
386 smlal v18.4s, v10.4h, v0.4h[0]
387 smlal v18.4s, v11.4h, v0.4h[2]
388
389
390
391 ld1 {v12.4h},[x0],x6
392 ld1 {v14.4h},[x0],x6
393 ld1 {v13.4h},[x0],x6
394 ld1 {v15.4h},[x0],x6
395
396
397
398
399 smlal v24.4s, v14.4h, v5.4h[1]
400 smlsl v26.4s, v14.4h, v0.4h[2]
401 smlal v28.4s, v14.4h, v5.4h[3]
402 smlal v30.4s, v14.4h, v4.4h[3]
403
404
405 smlal v24.4s, v15.4h, v5.4h[3]
406 smlsl v26.4s, v15.4h, v1.4h[1]
407 smlal v28.4s, v15.4h, v3.4h[1]
408 smlsl v30.4s, v15.4h, v7.4h[3]
409
410
411 smlal v20.4s, v12.4h, v5.4h[0]
412 smlal v20.4s, v13.4h, v5.4h[2]
413 smlsl v22.4s, v12.4h, v1.4h[0]
414 smlsl v22.4s, v13.4h, v0.4h[2]
415 smlal v16.4s, v12.4h, v7.4h[0]
416 smlal v16.4s, v13.4h, v4.4h[2]
417 smlal v18.4s, v12.4h, v3.4h[0]
418 smlal v18.4s, v13.4h, v6.4h[2]
419
420
421 ld1 {v10.4h},[x0],x6
422 ld1 {v8.4h},[x0],x6
423 ld1 {v11.4h},[x0],x6
424 ld1 {v9.4h},[x0],x6
425
426
427
428
429
430
431
432 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
433 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
434 smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
435 smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
436
437 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
438 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
439 smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
440 smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
441
442
443
444
445
446 smlal v20.4s, v10.4h, v6.4h[0]
447 smlal v20.4s, v11.4h, v6.4h[2]
448
449
450 smlsl v22.4s, v10.4h, v2.4h[0]
451 smlsl v22.4s, v11.4h, v3.4h[2]
452
453 smlal v16.4s, v10.4h, v2.4h[0]
454 smlal v16.4s, v11.4h, v0.4h[2]
455
456 smlsl v18.4s, v10.4h, v6.4h[0]
457 smlsl v18.4s, v11.4h, v2.4h[2]
458
459 ld1 {v12.4h},[x0],x6
460 ld1 {v14.4h},[x0],x6
461 ld1 {v13.4h},[x0],x6
462 ld1 {v15.4h},[x0],x6
463
464
465 smlal v24.4s, v14.4h, v7.4h[1]
466 smlsl v26.4s, v14.4h, v5.4h[3]
467 smlal v28.4s, v14.4h, v4.4h[1]
468 smlsl v30.4s, v14.4h, v2.4h[3]
469
470
471 smlal v24.4s, v15.4h, v7.4h[3]
472 smlsl v26.4s, v15.4h, v7.4h[1]
473 smlal v28.4s, v15.4h, v6.4h[3]
474 smlsl v30.4s, v15.4h, v6.4h[1]
475
476
477 smlal v20.4s, v12.4h, v7.4h[0]
478 smlal v20.4s, v13.4h, v7.4h[2]
479 smlsl v22.4s, v12.4h, v5.4h[0]
480 smlsl v22.4s, v13.4h, v6.4h[2]
481 smlal v16.4s, v12.4h, v3.4h[0]
482 smlal v16.4s, v13.4h, v5.4h[2]
483 smlsl v18.4s, v12.4h, v1.4h[0]
484 smlsl v18.4s, v13.4h, v4.4h[2]
485
486
487
488shift1:
489 add v8.4s, v20.4s , v24.4s
490 sub v10.4s, v20.4s , v24.4s
491
492 add v12.4s, v22.4s , v26.4s
493 sub v24.4s, v22.4s , v26.4s
494
495 add v14.4s, v16.4s , v28.4s
496 sub v26.4s, v16.4s , v28.4s
497
498
499 add v16.4s, v18.4s , v30.4s
500 sub v28.4s, v18.4s , v30.4s
501
502
503 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
504 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
505 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
506 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
507 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
508 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
509 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
510 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
511
512
513 // registers used q15,q14,q6,q7
514
515 umov x15,v24.d[0]
516 umov x16,v25.d[0]
517 umov x19,v26.d[0]
518 umov x20,v27.d[0]
519
520 trn1 v24.4h, v30.4h, v12.4h
521 trn2 v25.4h, v30.4h, v12.4h
522 trn1 v26.4h, v31.4h, v13.4h
523 trn2 v27.4h, v31.4h, v13.4h
524
525 trn1 v30.2s, v24.2s, v26.2s
526 trn2 v31.2s, v24.2s, v26.2s
527 trn1 v12.2s, v25.2s, v27.2s
528 trn2 v13.2s, v25.2s, v27.2s
529
530 trn1 v24.4h, v14.4h, v18.4h
531 trn2 v25.4h, v14.4h, v18.4h
532 trn1 v26.4h, v15.4h, v19.4h
533 trn2 v27.4h, v15.4h, v19.4h
534
535 trn1 v14.2s, v24.2s, v26.2s
536 trn2 v15.2s, v24.2s, v26.2s
537 trn1 v18.2s, v25.2s, v27.2s
538 trn2 v19.2s, v25.2s, v27.2s
539
540 mov v24.d[0],x15
541 mov v25.d[0],x16
542 mov v26.d[0],x19
543 mov v27.d[0],x20
544
545// d30 =x0 1- 4 values
546// d31 =x2 1- 4 values
547// d12=x1 1- 4 values
548// d13=x3 1- 4 values
549// d14 =x0 28-31 values
550// d15 =x2 28- 31 values
551// d18=x1 28- 31 values
552// d19=x3 28- 31 values
553
554
555
556 st1 { v30.4h, v31.4h},[x1],#16
557 st1 { v12.4h, v13.4h},[x1],#16
558 add x1,x1,#192
559 st1 { v14.4h, v15.4h},[x1],#16
560 st1 { v18.4h, v19.4h},[x1],#16
561 sub x1,x1,#224
562
563 mov x0,x8
564
565
566
567
568
569 ld1 {v10.4h},[x0],x6
570 ld1 {v8.4h},[x0],x6
571 ld1 {v11.4h},[x0],x6
572 ld1 {v9.4h},[x0],x6
573
574
575
576
577 smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
578 smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
579 smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
580 smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
581
582 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
583 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
584 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
585 smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
586
587
588
589
590
591 smull v20.4s, v10.4h, v0.4h[0]
592 smlal v20.4s, v11.4h, v4.4h[2]
593
594
595 smull v22.4s, v10.4h, v0.4h[0]
596 smlal v22.4s, v11.4h, v5.4h[2]
597
598 smull v16.4s, v10.4h, v0.4h[0]
599 smlal v16.4s, v11.4h, v6.4h[2]
600
601 smull v18.4s, v10.4h, v0.4h[0]
602 smlal v18.4s, v11.4h, v7.4h[2]
603 cmp x11,x10
604 bhs shift2
605
606 ld1 {v12.4h},[x0],x6
607 ld1 {v14.4h},[x0],x6
608 ld1 {v13.4h},[x0],x6
609 ld1 {v15.4h},[x0],x6
610
611
612 smlsl v24.4s, v14.4h, v4.4h[3]
613 smlsl v26.4s, v14.4h, v2.4h[1]
614 smlsl v28.4s, v14.4h, v0.4h[1]
615 smlsl v30.4s, v14.4h, v2.4h[3]
616
617
618 smlsl v24.4s, v15.4h, v0.4h[3]
619 smlsl v26.4s, v15.4h, v3.4h[1]
620 smlsl v28.4s, v15.4h, v6.4h[3]
621 smlal v30.4s, v15.4h, v5.4h[3]
622
623
624 smlsl v20.4s, v12.4h, v7.4h[0]
625 smlsl v20.4s, v13.4h, v2.4h[2]
626 smlsl v22.4s, v12.4h, v5.4h[0]
627 smlsl v22.4s, v13.4h, v0.4h[2]
628 smlsl v16.4s, v12.4h, v3.4h[0]
629 smlsl v16.4s, v13.4h, v3.4h[2]
630 smlsl v18.4s, v12.4h, v1.4h[0]
631 smlsl v18.4s, v13.4h, v6.4h[2]
632
633 cmp x11,x9
634 bhs shift2
635
636
637 ld1 {v10.4h},[x0],x6
638 ld1 {v8.4h},[x0],x6
639 ld1 {v11.4h},[x0],x6
640 ld1 {v9.4h},[x0],x6
641
642
643
644
645
646
647
648 smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
649 smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
650 smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
651 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
652
653 smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
654 smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
655 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
656 smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
657
658
659
660
661
662 smlsl v20.4s, v10.4h, v2.4h[0]
663 smlsl v20.4s, v11.4h, v6.4h[2]
664
665
666 smlsl v22.4s, v10.4h, v6.4h[0]
667 smlal v22.4s, v11.4h, v4.4h[2]
668
669 smlal v16.4s, v10.4h, v6.4h[0]
670 smlal v16.4s, v11.4h, v0.4h[2]
671
672 smlal v18.4s, v10.4h, v2.4h[0]
673 smlal v18.4s, v11.4h, v5.4h[2]
674
675 cmp x11,x5
676 bhs shift2
677
678
679 ld1 {v12.4h},[x0],x6
680 ld1 {v14.4h},[x0],x6
681 ld1 {v13.4h},[x0],x6
682 ld1 {v15.4h},[x0],x6
683
684
685
686
687
688 smlal v24.4s, v14.4h, v2.4h[3]
689 smlal v26.4s, v14.4h, v3.4h[3]
690 smlsl v28.4s, v14.4h, v5.4h[3]
691 smlsl v30.4s, v14.4h, v0.4h[3]
692
693
694 smlal v24.4s, v15.4h, v1.4h[3]
695 smlsl v26.4s, v15.4h, v6.4h[3]
696 smlsl v28.4s, v15.4h, v0.4h[3]
697 smlal v30.4s, v15.4h, v7.4h[3]
698
699
700 smlal v20.4s, v12.4h, v5.4h[0]
701 smlal v20.4s, v13.4h, v0.4h[2]
702 smlal v22.4s, v12.4h, v1.4h[0]
703 smlal v22.4s, v13.4h, v6.4h[2]
704 smlal v16.4s, v12.4h, v7.4h[0]
705 smlsl v16.4s, v13.4h, v2.4h[2]
706 smlsl v18.4s, v12.4h, v3.4h[0]
707 smlsl v18.4s, v13.4h, v4.4h[2]
708
709
710 cmp x11,x7
711 bhs shift2
712
713
714 ld1 {v10.4h},[x0],x6
715 ld1 {v8.4h},[x0],x6
716 ld1 {v11.4h},[x0],x6
717 ld1 {v9.4h},[x0],x6
718
719
720
721
722
723
724
725 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
726 smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
727 smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
728 smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
729
730 smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
731 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
732 smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
733 smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
734
735
736
737
738
739 smlal v20.4s, v10.4h, v0.4h[0]
740 smlsl v20.4s, v11.4h, v7.4h[2]
741
742
743 smlsl v22.4s, v10.4h, v0.4h[0]
744 smlsl v22.4s, v11.4h, v1.4h[2]
745
746 smlsl v16.4s, v10.4h, v0.4h[0]
747 smlal v16.4s, v11.4h, v5.4h[2]
748
749 smlal v18.4s, v10.4h, v0.4h[0]
750 smlal v18.4s, v11.4h, v3.4h[2]
751
752
753
754 ld1 {v12.4h},[x0],x6
755 ld1 {v14.4h},[x0],x6
756 ld1 {v13.4h},[x0],x6
757 ld1 {v15.4h},[x0],x6
758
759
760 smlsl v24.4s, v14.4h, v0.4h[1]
761 smlal v26.4s, v14.4h, v6.4h[1]
762 smlal v28.4s, v14.4h, v4.4h[1]
763 smlsl v30.4s, v14.4h, v1.4h[1]
764
765
766 smlsl v24.4s, v15.4h, v3.4h[3]
767 smlal v26.4s, v15.4h, v0.4h[1]
768 smlsl v28.4s, v15.4h, v5.4h[1]
769 smlsl v30.4s, v15.4h, v6.4h[1]
770
771
772 smlsl v20.4s, v12.4h, v3.4h[0]
773 smlsl v20.4s, v13.4h, v1.4h[2]
774 smlsl v22.4s, v12.4h, v7.4h[0]
775 smlal v22.4s, v13.4h, v3.4h[2]
776 smlal v16.4s, v12.4h, v1.4h[0]
777 smlal v16.4s, v13.4h, v7.4h[2]
778 smlsl v18.4s, v12.4h, v5.4h[0]
779 smlsl v18.4s, v13.4h, v2.4h[2]
780
781 ld1 {v10.4h},[x0],x6
782 ld1 {v8.4h},[x0],x6
783 ld1 {v11.4h},[x0],x6
784 ld1 {v9.4h},[x0],x6
785
786
787
788
789 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
790 smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
791 smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
792 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
793
794 smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
795 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
796 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
797 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
798
799
800
801
802
803 smlsl v20.4s, v10.4h, v6.4h[0]
804 smlal v20.4s, v11.4h, v5.4h[2]
805
806
807 smlal v22.4s, v10.4h, v2.4h[0]
808 smlal v22.4s, v11.4h, v7.4h[2]
809
810 smlsl v16.4s, v10.4h, v2.4h[0]
811 smlsl v16.4s, v11.4h, v4.4h[2]
812
813 smlal v18.4s, v10.4h, v6.4h[0]
814 smlal v18.4s, v11.4h, v1.4h[2]
815
816
817 ld1 {v12.4h},[x0],x6
818 ld1 {v14.4h},[x0],x6
819 ld1 {v13.4h},[x0],x6
820 ld1 {v15.4h},[x0],x6
821
822
823
824
825
826 smlal v24.4s, v14.4h, v1.4h[1]
827 smlsl v26.4s, v14.4h, v0.4h[3]
828 smlal v28.4s, v14.4h, v1.4h[3]
829 smlsl v30.4s, v14.4h, v3.4h[1]
830
831
832 smlal v24.4s, v15.4h, v5.4h[3]
833 smlsl v26.4s, v15.4h, v5.4h[1]
834 smlal v28.4s, v15.4h, v4.4h[3]
835 smlsl v30.4s, v15.4h, v4.4h[1]
836
837
838 smlal v20.4s, v12.4h, v1.4h[0]
839 smlal v20.4s, v13.4h, v3.4h[2]
840 smlsl v22.4s, v12.4h, v3.4h[0]
841 smlsl v22.4s, v13.4h, v2.4h[2]
842 smlal v16.4s, v12.4h, v5.4h[0]
843 smlal v16.4s, v13.4h, v1.4h[2]
844 smlsl v18.4s, v12.4h, v7.4h[0]
845 smlsl v18.4s, v13.4h, v0.4h[2]
846
847shift2:
848 add v8.4s, v20.4s , v24.4s
849 sub v10.4s, v20.4s , v24.4s
850
851 add v12.4s, v22.4s , v26.4s
852 sub v24.4s, v22.4s , v26.4s
853
854 add v14.4s, v16.4s , v28.4s
855 sub v26.4s, v16.4s , v28.4s
856
857
858 add v16.4s, v18.4s , v30.4s
859 sub v28.4s, v18.4s , v30.4s
860
861
862 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
863 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
864 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
865 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
866 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
867 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
868 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
869 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
870
871 umov x15,v24.d[0]
872 umov x16,v25.d[0]
873 umov x19,v26.d[0]
874 umov x20,v27.d[0]
875
876 trn1 v24.4h, v30.4h, v12.4h
877 trn2 v25.4h, v30.4h, v12.4h
878 trn1 v26.4h, v31.4h, v13.4h
879 trn2 v27.4h, v31.4h, v13.4h
880
881 trn1 v30.2s, v24.2s, v26.2s
882 trn2 v31.2s, v24.2s, v26.2s
883 trn1 v12.2s, v25.2s, v27.2s
884 trn2 v13.2s, v25.2s, v27.2s
885
886 trn1 v24.4h, v14.4h, v18.4h
887 trn2 v25.4h, v14.4h, v18.4h
888 trn1 v26.4h, v15.4h, v19.4h
889 trn2 v27.4h, v15.4h, v19.4h
890
891 trn1 v14.2s, v24.2s, v26.2s
892 trn2 v15.2s, v24.2s, v26.2s
893 trn1 v18.2s, v25.2s, v27.2s
894 trn2 v19.2s, v25.2s, v27.2s
895
896 mov v24.d[0],x15
897 mov v25.d[0],x16
898 mov v26.d[0],x19
899 mov v27.d[0],x20
900
901 st1 { v30.4h, v31.4h},[x1],#16
902 st1 { v12.4h, v13.4h},[x1],#16
903 add x1,x1,#128
904 st1 { v14.4h, v15.4h},[x1],#16
905 st1 { v18.4h, v19.4h},[x1],#16
906 sub x1,x1,#160
907 mov x0,x8
908
909
910
911 ld1 {v10.4h},[x0],x6
912 ld1 {v8.4h},[x0],x6
913 ld1 {v11.4h},[x0],x6
914 ld1 {v9.4h},[x0],x6
915
916
917 smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
918 smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
919 smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
920 smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
921
922 smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
923 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
924 smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
925 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
926
927
928
929
930
931 smull v20.4s, v10.4h, v0.4h[0]
932 smlsl v20.4s, v11.4h, v7.4h[2]
933
934
935 smull v22.4s, v10.4h, v0.4h[0]
936 smlsl v22.4s, v11.4h, v6.4h[2]
937
938 smull v16.4s, v10.4h, v0.4h[0]
939 smlsl v16.4s, v11.4h, v5.4h[2]
940
941 smull v18.4s, v10.4h, v0.4h[0]
942 smlsl v18.4s, v11.4h, v4.4h[2]
943
944 cmp x11,x10
945 bhs shift3
946
947 ld1 {v12.4h},[x0],x6
948 ld1 {v14.4h},[x0],x6
949 ld1 {v13.4h},[x0],x6
950 ld1 {v15.4h},[x0],x6
951
952
953
954
955 smlsl v24.4s, v14.4h, v5.4h[1]
956 smlsl v26.4s, v14.4h, v7.4h[3]
957 smlal v28.4s, v14.4h, v5.4h[3]
958 smlal v30.4s, v14.4h, v3.4h[1]
959
960
961 smlal v24.4s, v15.4h, v2.4h[1]
962 smlal v26.4s, v15.4h, v1.4h[1]
963 smlal v28.4s, v15.4h, v4.4h[3]
964 smlsl v30.4s, v15.4h, v7.4h[3]
965
966
967 smlsl v20.4s, v12.4h, v1.4h[0]
968 smlal v20.4s, v13.4h, v6.4h[2]
969 smlsl v22.4s, v12.4h, v3.4h[0]
970 smlal v22.4s, v13.4h, v3.4h[2]
971 smlsl v16.4s, v12.4h, v5.4h[0]
972 smlal v16.4s, v13.4h, v0.4h[2]
973 smlsl v18.4s, v12.4h, v7.4h[0]
974 smlal v18.4s, v13.4h, v2.4h[2]
975
976 cmp x11,x9
977 bhs shift3
978
979 ld1 {v10.4h},[x0],x6
980 ld1 {v8.4h},[x0],x6
981 ld1 {v11.4h},[x0],x6
982 ld1 {v9.4h},[x0],x6
983
984 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
985 smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
986 smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
987 smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
988
989 smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
990 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
991 smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
992 smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
993
994
995
996
997
998 smlal v20.4s, v10.4h, v2.4h[0]
999 smlsl v20.4s, v11.4h, v5.4h[2]
1000
1001
1002 smlal v22.4s, v10.4h, v6.4h[0]
1003 smlsl v22.4s, v11.4h, v0.4h[2]
1004
1005 smlsl v16.4s, v10.4h, v6.4h[0]
1006 smlsl v16.4s, v11.4h, v4.4h[2]
1007
1008 smlsl v18.4s, v10.4h, v2.4h[0]
1009 smlal v18.4s, v11.4h, v6.4h[2]
1010
1011 cmp x11,x5
1012 bhs shift3
1013
1014
1015 ld1 {v12.4h},[x0],x6
1016 ld1 {v14.4h},[x0],x6
1017 ld1 {v13.4h},[x0],x6
1018 ld1 {v15.4h},[x0],x6
1019
1020
1021
1022
1023
1024
1025 smlsl v24.4s, v14.4h, v7.4h[1]
1026 smlal v26.4s, v14.4h, v2.4h[1]
1027 smlal v28.4s, v14.4h, v4.4h[1]
1028 smlsl v30.4s, v14.4h, v5.4h[1]
1029
1030
1031 smlal v24.4s, v15.4h, v0.4h[3]
1032 smlal v26.4s, v15.4h, v7.4h[1]
1033 smlsl v28.4s, v15.4h, v1.4h[1]
1034 smlsl v30.4s, v15.4h, v6.4h[1]
1035
1036
1037 smlsl v20.4s, v12.4h, v3.4h[0]
1038 smlal v20.4s, v13.4h, v4.4h[2]
1039 smlal v22.4s, v12.4h, v7.4h[0]
1040 smlal v22.4s, v13.4h, v2.4h[2]
1041 smlal v16.4s, v12.4h, v1.4h[0]
1042 smlsl v16.4s, v13.4h, v6.4h[2]
1043 smlal v18.4s, v12.4h, v5.4h[0]
1044 smlsl v18.4s, v13.4h, v0.4h[2]
1045
1046
1047 cmp x11,x7
1048 bhs shift3
1049
1050
1051 ld1 {v10.4h},[x0],x6
1052 ld1 {v8.4h},[x0],x6
1053 ld1 {v11.4h},[x0],x6
1054 ld1 {v9.4h},[x0],x6
1055
1056
1057 smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
1058 smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
1059 smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
1060 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
1061
1062 smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
1063 smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1064 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1065 smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1066
1067
1068
1069
1070
1071 smlal v20.4s, v10.4h, v0.4h[0]
1072 smlsl v20.4s, v11.4h, v3.4h[2]
1073
1074
1075 smlsl v22.4s, v10.4h, v0.4h[0]
1076 smlsl v22.4s, v11.4h, v5.4h[2]
1077
1078 smlsl v16.4s, v10.4h, v0.4h[0]
1079 smlal v16.4s, v11.4h, v1.4h[2]
1080
1081 smlal v18.4s, v10.4h, v0.4h[0]
1082 smlal v18.4s, v11.4h, v7.4h[2]
1083
1084
1085 ld1 {v12.4h},[x0],x6
1086 ld1 {v14.4h},[x0],x6
1087 ld1 {v13.4h},[x0],x6
1088 ld1 {v15.4h},[x0],x6
1089
1090
1091
1092 smlal v24.4s, v14.4h, v6.4h[3]
1093 smlal v26.4s, v14.4h, v3.4h[3]
1094 smlsl v28.4s, v14.4h, v1.4h[3]
1095 smlal v30.4s, v14.4h, v7.4h[1]
1096
1097
1098 smlal v24.4s, v15.4h, v1.4h[3]
1099 smlsl v26.4s, v15.4h, v2.4h[3]
1100 smlal v28.4s, v15.4h, v7.4h[1]
1101 smlal v30.4s, v15.4h, v4.4h[1]
1102
1103
1104 smlsl v20.4s, v12.4h, v5.4h[0]
1105 smlal v20.4s, v13.4h, v2.4h[2]
1106 smlal v22.4s, v12.4h, v1.4h[0]
1107 smlsl v22.4s, v13.4h, v7.4h[2]
1108 smlsl v16.4s, v12.4h, v7.4h[0]
1109 smlsl v16.4s, v13.4h, v3.4h[2]
1110 smlsl v18.4s, v12.4h, v3.4h[0]
1111 smlal v18.4s, v13.4h, v1.4h[2]
1112
1113
1114
1115 ld1 {v10.4h},[x0],x6
1116 ld1 {v8.4h},[x0],x6
1117 ld1 {v11.4h},[x0],x6
1118 ld1 {v9.4h},[x0],x6
1119
1120
1121
1122
1123 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
1124 smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
1125 smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
1126 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
1127
1128 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1129 smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1130 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1131 smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1132
1133
1134
1135
1136
1137 smlal v20.4s, v10.4h, v6.4h[0]
1138 smlsl v20.4s, v11.4h, v1.4h[2]
1139
1140
1141 smlsl v22.4s, v10.4h, v2.4h[0]
1142 smlal v22.4s, v11.4h, v4.4h[2]
1143
1144 smlal v16.4s, v10.4h, v2.4h[0]
1145 smlsl v16.4s, v11.4h, v7.4h[2]
1146
1147 smlsl v18.4s, v10.4h, v6.4h[0]
1148 smlsl v18.4s, v11.4h, v5.4h[2]
1149
1150
1151 ld1 {v12.4h},[x0],x6
1152 ld1 {v14.4h},[x0],x6
1153 ld1 {v13.4h},[x0],x6
1154 ld1 {v15.4h},[x0],x6
1155
1156 smlal v24.4s, v14.4h, v4.4h[3]
1157 smlsl v26.4s, v14.4h, v6.4h[1]
1158 smlal v28.4s, v14.4h, v7.4h[3]
1159 smlal v30.4s, v14.4h, v6.4h[3]
1160
1161
1162 smlal v24.4s, v15.4h, v3.4h[3]
1163 smlsl v26.4s, v15.4h, v3.4h[1]
1164 smlal v28.4s, v15.4h, v2.4h[3]
1165 smlsl v30.4s, v15.4h, v2.4h[1]
1166
1167
1168 smlsl v20.4s, v12.4h, v7.4h[0]
1169 smlal v20.4s, v13.4h, v0.4h[2]
1170 smlal v22.4s, v12.4h, v5.4h[0]
1171 smlsl v22.4s, v13.4h, v1.4h[2]
1172 smlsl v16.4s, v12.4h, v3.4h[0]
1173 smlal v16.4s, v13.4h, v2.4h[2]
1174 smlal v18.4s, v12.4h, v1.4h[0]
1175 smlsl v18.4s, v13.4h, v3.4h[2]
1176
1177shift3:
1178 add v8.4s, v20.4s , v24.4s
1179 sub v10.4s, v20.4s , v24.4s
1180
1181 add v12.4s, v22.4s , v26.4s
1182 sub v24.4s, v22.4s , v26.4s
1183
1184 add v14.4s, v16.4s , v28.4s
1185 sub v26.4s, v16.4s , v28.4s
1186
1187
1188 add v16.4s, v18.4s , v30.4s
1189 sub v28.4s, v18.4s , v30.4s
1190
1191
1192 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1193 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1194 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1195 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1196 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1197 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1198 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1199 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1200
1201 umov x15,v24.d[0]
1202 umov x16,v25.d[0]
1203 umov x19,v26.d[0]
1204 umov x20,v27.d[0]
1205
1206 trn1 v24.4h, v30.4h, v12.4h
1207 trn2 v25.4h, v30.4h, v12.4h
1208 trn1 v26.4h, v31.4h, v13.4h
1209 trn2 v27.4h, v31.4h, v13.4h
1210
1211 trn1 v30.2s, v24.2s, v26.2s
1212 trn2 v31.2s, v24.2s, v26.2s
1213 trn1 v12.2s, v25.2s, v27.2s
1214 trn2 v13.2s, v25.2s, v27.2s
1215
1216 trn1 v24.4h, v14.4h, v18.4h
1217 trn2 v25.4h, v14.4h, v18.4h
1218 trn1 v26.4h, v15.4h, v19.4h
1219 trn2 v27.4h, v15.4h, v19.4h
1220
1221 trn1 v14.2s, v24.2s, v26.2s
1222 trn2 v15.2s, v24.2s, v26.2s
1223 trn1 v18.2s, v25.2s, v27.2s
1224 trn2 v19.2s, v25.2s, v27.2s
1225
1226 mov v24.d[0],x15
1227 mov v25.d[0],x16
1228 mov v26.d[0],x19
1229 mov v27.d[0],x20
1230 st1 { v30.4h, v31.4h},[x1],#16
1231 st1 { v12.4h, v13.4h},[x1],#16
1232 add x1,x1,#64
1233 st1 { v14.4h, v15.4h},[x1],#16
1234 st1 { v18.4h, v19.4h},[x1],#16
1235 sub x1,x1,#96
1236
1237 mov x0,x8
1238
1239
1240
1241 ld1 {v10.4h},[x0],x6
1242 ld1 {v8.4h},[x0],x6
1243 ld1 {v11.4h},[x0],x6
1244 ld1 {v9.4h},[x0],x6
1245
1246
1247 smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
1248 smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
1249 smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
1250 smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
1251
1252 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1253 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1254 smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1255 smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1256
1257
1258
1259
1260
1261 smull v20.4s, v10.4h, v0.4h[0]
1262 smlsl v20.4s, v11.4h, v3.4h[2]
1263
1264
1265 smull v22.4s, v10.4h, v0.4h[0]
1266 smlsl v22.4s, v11.4h, v2.4h[2]
1267
1268 smull v16.4s, v10.4h, v0.4h[0]
1269 smlsl v16.4s, v11.4h, v1.4h[2]
1270
1271 smull v18.4s, v10.4h, v0.4h[0]
1272 smlsl v18.4s, v11.4h, v0.4h[2]
1273
1274 cmp x11,x10
1275 bhs shift4
1276
1277 ld1 {v12.4h},[x0],x6
1278 ld1 {v14.4h},[x0],x6
1279 ld1 {v13.4h},[x0],x6
1280 ld1 {v15.4h},[x0],x6
1281
1282
1283
1284
1285
1286
1287 smlal v24.4s, v14.4h, v0.4h[1]
1288 smlal v26.4s, v14.4h, v1.4h[3]
1289 smlal v28.4s, v14.4h, v4.4h[1]
1290 smlal v30.4s, v14.4h, v6.4h[3]
1291
1292
1293 smlsl v24.4s, v15.4h, v4.4h[1]
1294 smlsl v26.4s, v15.4h, v0.4h[3]
1295 smlsl v28.4s, v15.4h, v2.4h[3]
1296 smlsl v30.4s, v15.4h, v6.4h[1]
1297
1298
1299 smlal v20.4s, v12.4h, v7.4h[0]
1300 smlal v20.4s, v13.4h, v5.4h[2]
1301 smlal v22.4s, v12.4h, v5.4h[0]
1302 smlsl v22.4s, v13.4h, v7.4h[2]
1303 smlal v16.4s, v12.4h, v3.4h[0]
1304 smlsl v16.4s, v13.4h, v4.4h[2]
1305 smlal v18.4s, v12.4h, v1.4h[0]
1306 smlsl v18.4s, v13.4h, v1.4h[2]
1307
1308 cmp x11,x9
1309 bhs shift4
1310
1311 ld1 {v10.4h},[x0],x6
1312 ld1 {v8.4h},[x0],x6
1313 ld1 {v11.4h},[x0],x6
1314 ld1 {v9.4h},[x0],x6
1315
1316
1317
1318 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
1319 smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
1320 smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
1321 smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
1322
1323 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1324 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1325 smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1326 smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1327
1328
1329
1330
1331
1332 smlsl v20.4s, v10.4h, v2.4h[0]
1333 smlal v20.4s, v11.4h, v1.4h[2]
1334
1335
1336 smlsl v22.4s, v10.4h, v6.4h[0]
1337 smlal v22.4s, v11.4h, v3.4h[2]
1338
1339 smlal v16.4s, v10.4h, v6.4h[0]
1340 smlsl v16.4s, v11.4h, v7.4h[2]
1341
1342 smlal v18.4s, v10.4h, v2.4h[0]
1343 smlsl v18.4s, v11.4h, v2.4h[2]
1344
1345 cmp x11,x5
1346 bhs shift4
1347
1348
1349 ld1 {v12.4h},[x0],x6
1350 ld1 {v14.4h},[x0],x6
1351 ld1 {v13.4h},[x0],x6
1352 ld1 {v15.4h},[x0],x6
1353
1354
1355
1356
1357
1358
1359 smlsl v24.4s, v14.4h, v1.4h[1]
1360 smlsl v26.4s, v14.4h, v7.4h[3]
1361 smlal v28.4s, v14.4h, v1.4h[3]
1362 smlal v30.4s, v14.4h, v4.4h[3]
1363
1364
1365 smlal v24.4s, v15.4h, v2.4h[1]
1366 smlal v26.4s, v15.4h, v5.4h[1]
1367 smlsl v28.4s, v15.4h, v3.4h[1]
1368 smlsl v30.4s, v15.4h, v4.4h[1]
1369
1370
1371 smlsl v20.4s, v12.4h, v5.4h[0]
1372 smlsl v20.4s, v13.4h, v7.4h[2]
1373 smlsl v22.4s, v12.4h, v1.4h[0]
1374 smlal v22.4s, v13.4h, v1.4h[2]
1375 smlsl v16.4s, v12.4h, v7.4h[0]
1376 smlal v16.4s, v13.4h, v5.4h[2]
1377 smlal v18.4s, v12.4h, v3.4h[0]
1378 smlsl v18.4s, v13.4h, v3.4h[2]
1379
1380 cmp x11,x7
1381 bhs shift4
1382
1383
1384 ld1 {v10.4h},[x0],x6
1385 ld1 {v8.4h},[x0],x6
1386 ld1 {v11.4h},[x0],x6
1387 ld1 {v9.4h},[x0],x6
1388
1389
1390 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
1391 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
1392 smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
1393 smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
1394
1395 smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1396 smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1397 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1398 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1399
1400
1401
1402
1403
1404 smlal v20.4s, v10.4h, v0.4h[0]
1405 smlsl v20.4s, v11.4h, v0.4h[2]
1406
1407
1408 smlsl v22.4s, v10.4h, v0.4h[0]
1409 smlal v22.4s, v11.4h, v6.4h[2]
1410
1411 smlsl v16.4s, v10.4h, v0.4h[0]
1412 smlal v16.4s, v11.4h, v2.4h[2]
1413
1414 smlal v18.4s, v10.4h, v0.4h[0]
1415 smlsl v18.4s, v11.4h, v4.4h[2]
1416
1417
1418
1419
1420 ld1 {v12.4h},[x0],x6
1421 ld1 {v14.4h},[x0],x6
1422 ld1 {v13.4h},[x0],x6
1423 ld1 {v15.4h},[x0],x6
1424
1425
1426
1427
1428
1429
1430 smlal v24.4s, v14.4h, v3.4h[1]
1431 smlsl v26.4s, v14.4h, v2.4h[1]
1432 smlal v28.4s, v14.4h, v7.4h[3]
1433 smlal v30.4s, v14.4h, v2.4h[3]
1434
1435
1436 smlsl v24.4s, v15.4h, v0.4h[3]
1437 smlal v26.4s, v15.4h, v4.4h[3]
1438 smlal v28.4s, v15.4h, v6.4h[3]
1439 smlsl v30.4s, v15.4h, v2.4h[1]
1440
1441
1442 smlal v20.4s, v12.4h, v3.4h[0]
1443 smlsl v20.4s, v13.4h, v6.4h[2]
1444 smlal v22.4s, v12.4h, v7.4h[0]
1445 smlsl v22.4s, v13.4h, v4.4h[2]
1446 smlsl v16.4s, v12.4h, v1.4h[0]
1447 smlal v16.4s, v13.4h, v0.4h[2]
1448 smlal v18.4s, v12.4h, v5.4h[0]
1449 smlsl v18.4s, v13.4h, v5.4h[2]
1450
1451
1452 ld1 {v10.4h},[x0],x6
1453 ld1 {v8.4h},[x0],x6
1454 ld1 {v11.4h},[x0],x6
1455 ld1 {v9.4h},[x0],x6
1456
1457
1458
1459
1460
1461 smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0)
1462 smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
1463 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
1464 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
1465
1466 smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
1467 smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1468 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1469 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1470
1471
1472
1473
1474
1475 smlsl v20.4s, v10.4h, v6.4h[0]
1476 smlal v20.4s, v11.4h, v2.4h[2]
1477
1478
1479 smlal v22.4s, v10.4h, v2.4h[0]
1480 smlsl v22.4s, v11.4h, v0.4h[2]
1481
1482 smlsl v16.4s, v10.4h, v2.4h[0]
1483 smlal v16.4s, v11.4h, v3.4h[2]
1484
1485 smlal v18.4s, v10.4h, v6.4h[0]
1486 smlsl v18.4s, v11.4h, v6.4h[2]
1487
1488
1489 ld1 {v12.4h},[x0],x6
1490 ld1 {v14.4h},[x0],x6
1491 ld1 {v13.4h},[x0],x6
1492 ld1 {v15.4h},[x0],x6
1493
1494
1495
1496
1497 smlsl v24.4s, v14.4h, v5.4h[1]
1498 smlal v26.4s, v14.4h, v3.4h[3]
1499 smlsl v28.4s, v14.4h, v2.4h[1]
1500 smlal v30.4s, v14.4h, v0.4h[3]
1501
1502
1503 smlal v24.4s, v15.4h, v1.4h[3]
1504 smlsl v26.4s, v15.4h, v1.4h[1]
1505 smlal v28.4s, v15.4h, v0.4h[3]
1506 smlsl v30.4s, v15.4h, v0.4h[1]
1507
1508
1509 smlsl v20.4s, v12.4h, v1.4h[0]
1510 smlal v20.4s, v13.4h, v4.4h[2]
1511 smlal v22.4s, v12.4h, v3.4h[0]
1512 smlsl v22.4s, v13.4h, v5.4h[2]
1513 smlsl v16.4s, v12.4h, v5.4h[0]
1514 smlal v16.4s, v13.4h, v6.4h[2]
1515 smlal v18.4s, v12.4h, v7.4h[0]
1516 smlsl v18.4s, v13.4h, v7.4h[2]
1517
// shift4: finish one stage-1 pass. Combines the two accumulator sets
// (v20/v22/v16/v18 and v24/v26/v28/v30) with add/sub butterflies, narrows the
// eight 32-bit results to 16 bits, transposes them as two 4x4 halfword tiles
// and stores 64 bytes through x1. Reached by fall-through and by the
// "bhs shift4" early exits in the accumulation code above.
1518shift4:
// Butterflies: sums go to v8/v12/v14/v16, differences to v10/v24/v26/v28.
// Note that v24/v26/v28 and v16 are overwritten here, after their last use as inputs.
1519 add v8.4s, v20.4s , v24.4s
1520 sub v10.4s, v20.4s , v24.4s
1521
1522 add v12.4s, v22.4s , v26.4s
1523 sub v24.4s, v22.4s , v26.4s
1524
1525 add v14.4s, v16.4s , v28.4s
1526 sub v26.4s, v16.4s , v28.4s
1527
1528
1529 add v16.4s, v18.4s , v30.4s
1530 sub v28.4s, v18.4s , v30.4s
1531
1532
// sqrshrn: rounding right shift by shift_stage1_idct with signed saturation,
// narrowing each 32-bit lane to 16 bits.
1533 sqrshrn v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1534 sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1535 sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1536 sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1537 sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1538 sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1539 sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1540 sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1541
// v24-v27 serve as scratch for the transposes below; their low 64 bits are
// parked in x15/x16/x19/x20 and written back once the transposes are done.
1542 umov x15,v24.d[0]
1543 umov x16,v25.d[0]
1544 umov x19,v26.d[0]
1545 umov x20,v27.d[0]
1546
// 4x4 halfword transpose of rows v30, v12, v31, v13 (x0..x3 above):
// afterwards v30, v12, v31, v13 hold columns 0, 1, 2, 3 respectively.
1547 trn1 v24.4h, v30.4h, v12.4h
1548 trn2 v25.4h, v30.4h, v12.4h
1549 trn1 v26.4h, v31.4h, v13.4h
1550 trn2 v27.4h, v31.4h, v13.4h
1551
1552 trn1 v30.2s, v24.2s, v26.2s
1553 trn2 v31.2s, v24.2s, v26.2s
1554 trn1 v12.2s, v25.2s, v27.2s
1555 trn2 v13.2s, v25.2s, v27.2s
1556
// Same transpose for rows v14, v18, v15, v19 (x4..x7 above):
// afterwards v14, v18, v15, v19 hold columns 0, 1, 2, 3 respectively.
1557 trn1 v24.4h, v14.4h, v18.4h
1558 trn2 v25.4h, v14.4h, v18.4h
1559 trn1 v26.4h, v15.4h, v19.4h
1560 trn2 v27.4h, v15.4h, v19.4h
1561
1562 trn1 v14.2s, v24.2s, v26.2s
1563 trn2 v15.2s, v24.2s, v26.2s
1564 trn1 v18.2s, v25.2s, v27.2s
1565 trn2 v19.2s, v25.2s, v27.2s
1566
// Restore the low halves of the scratch registers saved above.
1567 mov v24.d[0],x15
1568 mov v25.d[0],x16
1569 mov v26.d[0],x19
1570 mov v27.d[0],x20
1571
// Four consecutive 16-byte stores through x1 (post-incremented), written in
// register order v30, v31, v12, v13, v14, v15, v18, v19.
1572 st1 { v30.4h, v31.4h},[x1],#16
1573 st1 { v12.4h, v13.4h},[x1],#16
1574 st1 { v14.4h, v15.4h},[x1],#16
1575 st1 { v18.4h, v19.4h},[x1],#16
1576
// Skip a further 96 bytes of the output buffer before the next pass.
1577 add x1,x1,#96
1578
// x14 is the pass counter: branch back to dct_stage1 (label outside this
// section) until it reaches zero, then fall through to second_stage_dct.
1579 subs x14,x14,#1
1580 bne dct_stage1
// second_stage_dct: set-up for the second pass. Reached by fall-through when
// the stage-1 pass counter hits zero (other entries, if any, are outside this
// section). Pops two register pairs from the stack (the matching pushes are
// not visible here), loads the constants used by the stage2 code and
// branches to stage2.
1581second_stage_dct:
1582// mov x0,x1
// Register roles below follow the register notes that precede dct_stage2.
// x8 = prediction stride, x7 = destination stride
1583 ldp x8, x7,[sp],#16
// x0 = scratch buffer, x1 = temp buffer
1584 ldp x0, x1,[sp],#16
1585
1586// add x4,x2,x8, lsl #1 @ x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
1587// add x5,x8,x8, lsl #1 @
1588// sub x0,x0,#512
// x11, x5, w6 and w9 are the thresholds that stage2 compares (unsigned, bhs)
// against x12 to skip later input groups. w6 and w9 are 32-bit loads from the
// literals x5_addr and x9_addr, whose values are defined outside this section.
1589 mov x11,#0xfffffff0
1590 mov x5, #0xffffff00
1591 ldr w6, x5_addr
1592 ldr w9, x9_addr
1593// sub x1,x1,#2048
// x4 keeps the initial temp-buffer address so x1 can be rewound to it later.
1594 mov x4,x1
// x10 is the post-increment of the second load in each stage2 load pair
// (16 + 240 = 256 bytes consumed per pair).
1595 mov x10,#240
// x14 is the loop counter for the second stage.
1596 mov x14,#8
1597 b stage2
1598
1599// registers free :
1600
1601// arm registers used
1602// x8 : prediction stride
1603// x7 : destination stride
1604// x1 : temp buffer
1605// x2 : pred buffer
1606// x3 : destination buffer
1607// x14 : loop counter
1608// x0 : scratch buffer
1609// x10 : used as stride
1610// x4 : used to store the initial address
1611// x12 : zero cols
1612// x11 : 0xfffffff0
1613// x5 : 0xffffff00
// dct_stage2: advance the saved base address in x4 by 32 bytes, reload the
// read pointer x1 from it and fall through into stage2. The branch that
// targets this label lies outside this section.
1614dct_stage2:
1615 add x4,x4,#32
1616 mov x1,x4
// stage2: first accumulation of a second-stage pass. Input is read through x1
// in eight groups of four 4-halfword vectors (two ld1 pairs per group pattern:
// v10/v11 + v8/v9, then v12/v13 + v14/v15, alternating). v10-v13 feed the
// accumulators v20/v22/v16/v18 and v8/v9/v14/v15 feed v24/v26/v28/v30; every
// product uses one lane of the coefficient registers v0-v7, which are set up
// outside this section.
// After each of the first four groups x12 (the "zero cols" value per the
// register notes) is compared unsigned with a threshold (x11, x5, x6, x9 in
// turn); when x12 >= threshold the remaining groups are skipped by branching
// to stage2_shift1. The last four groups are processed unconditionally.
// NOTE(review): the trailing "//// y1 * ..." remarks do not follow the lane
// index or the smlal/smlsl choice of the instruction they sit on; they look
// inherited from a different transform -- confirm before relying on them.
1617stage2:
// Group 1: starts all eight accumulators with smull.
1618 ld1 {v10.4h, v11.4h},[x1],#16
1619 ld1 {v8.4h, v9.4h},[x1],x10
1620
1621 smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
1622 smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
1623 smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
1624 smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
1625
1626 smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1627 smlal v26.4s, v9.4h, v2.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1628 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1629 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1630
1631
1632
1633 smull v20.4s, v10.4h, v0.4h[0]
1634 smlal v20.4s, v11.4h, v0.4h[2]
1635
1636
1637 smull v22.4s, v10.4h, v0.4h[0]
1638 smlal v22.4s, v11.4h, v1.4h[2]
1639
1640 smull v16.4s, v10.4h, v0.4h[0]
1641 smlal v16.4s, v11.4h, v2.4h[2]
1642
1643 smull v18.4s, v10.4h, v0.4h[0]
1644 smlal v18.4s, v11.4h, v3.4h[2]
// Early exit 1: skip groups 2-8 when x12 >= x11.
1645 cmp x12,x11
1646 bhs stage2_shift1
1647
// Group 2.
1648 ld1 {v12.4h, v13.4h},[x1],#16
1649 ld1 {v14.4h, v15.4h},[x1],x10
1650
1651
1652
1653
1654
1655
1656 smlal v24.4s, v14.4h, v1.4h[1]
1657 smlal v26.4s, v14.4h, v3.4h[3]
1658 smlal v28.4s, v14.4h, v6.4h[1]
1659 smlsl v30.4s, v14.4h, v7.4h[1]
1660
1661
1662 smlal v24.4s, v15.4h, v1.4h[3]
1663 smlal v26.4s, v15.4h, v5.4h[1]
1664 smlsl v28.4s, v15.4h, v7.4h[1]
1665 smlsl v30.4s, v15.4h, v3.4h[3]
1666
1667
1668 smlal v20.4s, v12.4h, v1.4h[0]
1669 smlal v20.4s, v13.4h, v1.4h[2]
1670 smlal v22.4s, v12.4h, v3.4h[0]
1671 smlal v22.4s, v13.4h, v4.4h[2]
1672 smlal v16.4s, v12.4h, v5.4h[0]
1673 smlal v16.4s, v13.4h, v7.4h[2]
1674 smlal v18.4s, v12.4h, v7.4h[0]
1675 smlsl v18.4s, v13.4h, v5.4h[2]
// Early exit 2: skip groups 3-8 when x12 >= x5.
1676 cmp x12,x5
1677 bhs stage2_shift1
1678
// Group 3.
1679 ld1 {v10.4h, v11.4h},[x1],#16
1680 ld1 {v8.4h, v9.4h},[x1],x10
1681
1682 smlal v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
1683 smlal v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
1684 smlsl v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
1685 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
1686
1687 smlal v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1688 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1689 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1690 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1691
1692
1693
1694
1695
1696 smlal v20.4s, v10.4h, v2.4h[0]
1697 smlal v20.4s, v11.4h, v2.4h[2]
1698
1699
1700 smlal v22.4s, v10.4h, v6.4h[0]
1701 smlal v22.4s, v11.4h, v7.4h[2]
1702
1703 smlsl v16.4s, v10.4h, v6.4h[0]
1704 smlsl v16.4s, v11.4h, v3.4h[2]
1705
1706 smlsl v18.4s, v10.4h, v2.4h[0]
1707 smlsl v18.4s, v11.4h, v1.4h[2]
1708
// Early exit 3: skip groups 4-8 when x12 >= x6.
1709 cmp x12,x6
1710 bhs stage2_shift1
1711
1712
// Group 4.
1713 ld1 {v12.4h, v13.4h},[x1],#16
1714 ld1 {v14.4h, v15.4h},[x1],x10
1715
1716
1717
1718
1719
1720 smlal v24.4s, v14.4h, v3.4h[1]
1721 smlsl v26.4s, v14.4h, v6.4h[1]
1722 smlsl v28.4s, v14.4h, v0.4h[1]
1723 smlsl v30.4s, v14.4h, v6.4h[3]
1724
1725
1726 smlal v24.4s, v15.4h, v3.4h[3]
1727 smlsl v26.4s, v15.4h, v4.4h[3]
1728 smlsl v28.4s, v15.4h, v2.4h[3]
1729 smlal v30.4s, v15.4h, v5.4h[3]
1730
1731
1732 smlal v20.4s, v12.4h, v3.4h[0]
1733 smlal v20.4s, v13.4h, v3.4h[2]
1734 smlsl v22.4s, v12.4h, v7.4h[0]
1735 smlsl v22.4s, v13.4h, v5.4h[2]
1736 smlsl v16.4s, v12.4h, v1.4h[0]
1737 smlsl v16.4s, v13.4h, v1.4h[2]
1738 smlsl v18.4s, v12.4h, v5.4h[0]
1739 smlal v18.4s, v13.4h, v7.4h[2]
1740
// Early exit 4 (the last one): skip groups 5-8 when x12 >= x9.
1741 cmp x12,x9
1742 bhs stage2_shift1
1743
1744
// Group 5. From here on all remaining groups are processed unconditionally.
1745 ld1 {v10.4h, v11.4h},[x1],#16
1746 ld1 {v8.4h, v9.4h},[x1],x10
1747
1748
1749 smlal v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
1750 smlsl v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
1751 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
1752 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
1753
1754 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1755 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1756 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1757 smlal v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1758
1759
1760
1761
1762
1763 smlal v20.4s, v10.4h, v0.4h[0]
1764 smlal v20.4s, v11.4h, v4.4h[2]
1765
1766
1767 smlsl v22.4s, v10.4h, v0.4h[0]
1768 smlsl v22.4s, v11.4h, v2.4h[2]
1769
1770 smlsl v16.4s, v10.4h, v0.4h[0]
1771 smlsl v16.4s, v11.4h, v6.4h[2]
1772
1773 smlal v18.4s, v10.4h, v0.4h[0]
1774 smlal v18.4s, v11.4h, v0.4h[2]
1775
// Group 6.
1776 ld1 {v12.4h, v13.4h},[x1],#16
1777 ld1 {v14.4h, v15.4h},[x1],x10
1778
1779
1780
1781
1782
1783 smlal v24.4s, v14.4h, v5.4h[1]
1784 smlsl v26.4s, v14.4h, v0.4h[2]
1785 smlal v28.4s, v14.4h, v5.4h[3]
1786 smlal v30.4s, v14.4h, v4.4h[3]
1787
1788
1789 smlal v24.4s, v15.4h, v5.4h[3]
1790 smlsl v26.4s, v15.4h, v1.4h[1]
1791 smlal v28.4s, v15.4h, v3.4h[1]
1792 smlsl v30.4s, v15.4h, v7.4h[3]
1793
1794
1795 smlal v20.4s, v12.4h, v5.4h[0]
1796 smlal v20.4s, v13.4h, v5.4h[2]
1797 smlsl v22.4s, v12.4h, v1.4h[0]
1798 smlsl v22.4s, v13.4h, v0.4h[2]
1799 smlal v16.4s, v12.4h, v7.4h[0]
1800 smlal v16.4s, v13.4h, v4.4h[2]
1801 smlal v18.4s, v12.4h, v3.4h[0]
1802 smlal v18.4s, v13.4h, v6.4h[2]
1803
1804
// Group 7.
1805 ld1 {v10.4h, v11.4h},[x1],#16
1806 ld1 {v8.4h, v9.4h},[x1],x10
1807
1808
1809
1810
1811 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
1812 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
1813 smlal v28.4s, v8.4h, v0.4h[1] //// y1 * sin3(part of b2)
1814 smlsl v30.4s, v8.4h, v4.4h[1] //// y1 * sin1(part of b3)
1815
1816 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1817 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
1818 smlal v28.4s, v9.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
1819 smlsl v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
1820
1821
1822
1823
1824
1825 smlal v20.4s, v10.4h, v6.4h[0]
1826 smlal v20.4s, v11.4h, v6.4h[2]
1827
1828
1829 smlsl v22.4s, v10.4h, v2.4h[0]
1830 smlsl v22.4s, v11.4h, v3.4h[2]
1831
1832 smlal v16.4s, v10.4h, v2.4h[0]
1833 smlal v16.4s, v11.4h, v0.4h[2]
1834
1835 smlsl v18.4s, v10.4h, v6.4h[0]
1836 smlsl v18.4s, v11.4h, v2.4h[2]
1837
// Group 8 (last); execution then falls through into stage2_shift1.
1838 ld1 {v12.4h, v13.4h},[x1],#16
1839 ld1 {v14.4h, v15.4h},[x1],x10
1840
1841 smlal v24.4s, v14.4h, v7.4h[1]
1842 smlsl v26.4s, v14.4h, v5.4h[3]
1843 smlal v28.4s, v14.4h, v4.4h[1]
1844 smlsl v30.4s, v14.4h, v2.4h[3]
1845
1846
1847 smlal v24.4s, v15.4h, v7.4h[3]
1848 smlsl v26.4s, v15.4h, v7.4h[1]
1849 smlal v28.4s, v15.4h, v6.4h[3]
1850 smlsl v30.4s, v15.4h, v6.4h[1]
1851
1852
1853 smlal v20.4s, v12.4h, v7.4h[0]
1854 smlal v20.4s, v13.4h, v7.4h[2]
1855 smlsl v22.4s, v12.4h, v5.4h[0]
1856 smlsl v22.4s, v13.4h, v6.4h[2]
1857 smlal v16.4s, v12.4h, v3.4h[0]
1858 smlal v16.4s, v13.4h, v5.4h[2]
1859 smlsl v18.4s, v12.4h, v1.4h[0]
1860 smlsl v18.4s, v13.4h, v4.4h[2]
1861
// stage2_shift1: completes the accumulation started at stage2 -- butterflies,
// narrowing by shift_stage2_idct, two 4x4 halfword transposes and 64 bytes
// stored through x0. It then rewinds x1 to the base address kept in x4 and
// accumulates the same input again with a different set of coefficient lanes;
// that second accumulation uses the same four early-exit tests on x12 and
// leaves via stage2_shift2.
// NOTE(review): the inline remarks below say ">> 7", the same wording as the
// stage-1 code; the shift actually applied is the value of shift_stage2_idct,
// which is defined outside this section -- confirm the number.
// NOTE(review): the "//// y1 * ..." remarks on the multiply lines do not track
// the lane or the smlal/smlsl choice of their instruction -- confirm before
// relying on them.
1862stage2_shift1:
// Butterflies: sums go to v8/v12/v14/v16, differences to v10/v24/v26/v28.
1863 add v8.4s, v20.4s , v24.4s
1864 sub v10.4s, v20.4s , v24.4s
1865
1866 add v12.4s, v22.4s , v26.4s
1867 sub v24.4s, v22.4s , v26.4s
1868
1869 add v14.4s, v16.4s , v28.4s
1870 sub v26.4s, v16.4s , v28.4s
1871
1872
1873 add v16.4s, v18.4s , v30.4s
1874 sub v28.4s, v18.4s , v30.4s
1875
1876
// Rounding, saturating right shift that narrows each 32-bit lane to 16 bits.
1877 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
1878 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
1879 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
1880 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
1881 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
1882 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
1883 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
1884 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
1885
1886
// v24-v27 are transpose scratch; park their low 64 bits in x15/x16/x19/x20.
1887 umov x15,v24.d[0]
1888 umov x16,v25.d[0]
1889 umov x19,v26.d[0]
1890 umov x20,v27.d[0]
1891
// 4x4 halfword transpose of rows v30, v12, v31, v13: afterwards these four
// registers hold columns 0, 1, 2, 3 respectively.
1892 trn1 v24.4h, v30.4h, v12.4h
1893 trn2 v25.4h, v30.4h, v12.4h
1894 trn1 v26.4h, v31.4h, v13.4h
1895 trn2 v27.4h, v31.4h, v13.4h
1896
1897 trn1 v30.2s, v24.2s, v26.2s
1898 trn2 v31.2s, v24.2s, v26.2s
1899 trn1 v12.2s, v25.2s, v27.2s
1900 trn2 v13.2s, v25.2s, v27.2s
1901
// Same transpose for rows v14, v18, v15, v19.
1902 trn1 v24.4h, v14.4h, v18.4h
1903 trn2 v25.4h, v14.4h, v18.4h
1904 trn1 v26.4h, v15.4h, v19.4h
1905 trn2 v27.4h, v15.4h, v19.4h
1906
1907 trn1 v14.2s, v24.2s, v26.2s
1908 trn2 v15.2s, v24.2s, v26.2s
1909 trn1 v18.2s, v25.2s, v27.2s
1910 trn2 v19.2s, v25.2s, v27.2s
1911
// Restore the low halves of the scratch registers.
1912 mov v24.d[0],x15
1913 mov v25.d[0],x16
1914 mov v26.d[0],x19
1915 mov v27.d[0],x20
1916
// Four consecutive 16-byte stores through x0 (post-incremented).
1917 st1 { v30.4h, v31.4h},[x0],#16
1918 st1 { v12.4h, v13.4h},[x0],#16
1919 st1 { v14.4h, v15.4h},[x0],#16
1920 st1 { v18.4h, v19.4h},[x0],#16
1921
// Rewind the read pointer to the base address and start the next accumulation.
1922 mov x1,x4
1923
1924
1925
1926
1927
1928
// Group 1: restarts all eight accumulators with smull.
1929 ld1 {v10.4h, v11.4h},[x1],#16
1930 ld1 {v8.4h, v9.4h},[x1],x10
1931
1932
1933 smull v24.4s, v8.4h, v2.4h[1] //// y1 * cos1(part of b0)
1934 smull v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
1935 smull v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
1936 smull v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
1937
1938 smlal v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
1939 smlsl v26.4s, v9.4h, v7.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
1940 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
1941 smlsl v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
1942
1943
1944
1945
1946
1947 smull v20.4s, v10.4h, v0.4h[0]
1948 smlal v20.4s, v11.4h, v4.4h[2]
1949
1950
1951 smull v22.4s, v10.4h, v0.4h[0]
1952 smlal v22.4s, v11.4h, v5.4h[2]
1953
1954 smull v16.4s, v10.4h, v0.4h[0]
1955 smlal v16.4s, v11.4h, v6.4h[2]
1956
1957 smull v18.4s, v10.4h, v0.4h[0]
1958 smlal v18.4s, v11.4h, v7.4h[2]
1959
// Early exit 1: skip groups 2-8 when x12 >= x11.
1960 cmp x12,x11
1961 bhs stage2_shift2
1962
// Group 2.
1963 ld1 {v12.4h, v13.4h},[x1],#16
1964 ld1 {v14.4h, v15.4h},[x1],x10
1965
1966
1967 smlsl v24.4s, v14.4h, v4.4h[3]
1968 smlsl v26.4s, v14.4h, v2.4h[1]
1969 smlsl v28.4s, v14.4h, v0.4h[1]
1970 smlsl v30.4s, v14.4h, v2.4h[3]
1971
1972
1973 smlsl v24.4s, v15.4h, v0.4h[3]
1974 smlsl v26.4s, v15.4h, v3.4h[1]
1975 smlsl v28.4s, v15.4h, v6.4h[3]
1976 smlal v30.4s, v15.4h, v5.4h[3]
1977
1978
1979 smlsl v20.4s, v12.4h, v7.4h[0]
1980 smlsl v20.4s, v13.4h, v2.4h[2]
1981 smlsl v22.4s, v12.4h, v5.4h[0]
1982 smlsl v22.4s, v13.4h, v0.4h[2]
1983 smlsl v16.4s, v12.4h, v3.4h[0]
1984 smlsl v16.4s, v13.4h, v3.4h[2]
1985 smlsl v18.4s, v12.4h, v1.4h[0]
1986 smlsl v18.4s, v13.4h, v6.4h[2]
1987
// Early exit 2: skip groups 3-8 when x12 >= x5.
1988 cmp x12,x5
1989 bhs stage2_shift2
1990
// Group 3.
1991 ld1 {v10.4h, v11.4h},[x1],#16
1992 ld1 {v8.4h, v9.4h},[x1],x10
1993
1994
1995
1996
1997
1998 smlsl v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
1999 smlal v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
2000 smlal v28.4s, v8.4h, v2.4h[3] //// y1 * sin3(part of b2)
2001 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
2002
2003 smlal v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2004 smlal v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2005 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2006 smlsl v30.4s, v9.4h, v6.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2007
2008
2009
2010
2011
2012 smlsl v20.4s, v10.4h, v2.4h[0]
2013 smlsl v20.4s, v11.4h, v6.4h[2]
2014
2015
2016 smlsl v22.4s, v10.4h, v6.4h[0]
2017 smlal v22.4s, v11.4h, v4.4h[2]
2018
2019 smlal v16.4s, v10.4h, v6.4h[0]
2020 smlal v16.4s, v11.4h, v0.4h[2]
2021
2022 smlal v18.4s, v10.4h, v2.4h[0]
2023 smlal v18.4s, v11.4h, v5.4h[2]
2024
// Early exit 3: skip groups 4-8 when x12 >= x6.
2025 cmp x12,x6
2026 bhs stage2_shift2
2027
2028
// Group 4.
2029 ld1 {v12.4h, v13.4h},[x1],#16
2030 ld1 {v14.4h, v15.4h},[x1],x10
2031
2032
2033
2034
2035
2036
2037 smlal v24.4s, v14.4h, v2.4h[3]
2038 smlal v26.4s, v14.4h, v3.4h[3]
2039 smlsl v28.4s, v14.4h, v5.4h[3]
2040 smlsl v30.4s, v14.4h, v0.4h[3]
2041
2042
2043 smlal v24.4s, v15.4h, v1.4h[3]
2044 smlsl v26.4s, v15.4h, v6.4h[3]
2045 smlsl v28.4s, v15.4h, v0.4h[3]
2046 smlal v30.4s, v15.4h, v7.4h[3]
2047
2048
2049 smlal v20.4s, v12.4h, v5.4h[0]
2050 smlal v20.4s, v13.4h, v0.4h[2]
2051 smlal v22.4s, v12.4h, v1.4h[0]
2052 smlal v22.4s, v13.4h, v6.4h[2]
2053 smlal v16.4s, v12.4h, v7.4h[0]
2054 smlsl v16.4s, v13.4h, v2.4h[2]
2055 smlsl v18.4s, v12.4h, v3.4h[0]
2056 smlsl v18.4s, v13.4h, v4.4h[2]
2057
// Early exit 4 (the last one): skip groups 5-8 when x12 >= x9.
2058 cmp x12,x9
2059 bhs stage2_shift2
2060
2061
// Group 5. The remaining groups are processed unconditionally.
2062 ld1 {v10.4h, v11.4h},[x1],#16
2063 ld1 {v8.4h, v9.4h},[x1],x10
2064
2065
2066
2067 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
2068 smlsl v26.4s, v8.4h, v1.4h[1] //// y1 * cos3(part of b1)
2069 smlsl v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
2070 smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1(part of b3)
2071
2072 smlsl v24.4s, v9.4h, v5.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2073 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2074 smlal v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2075 smlal v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2076
2077
2078
2079
2080
2081 smlal v20.4s, v10.4h, v0.4h[0]
2082 smlsl v20.4s, v11.4h, v7.4h[2]
2083
2084
2085 smlsl v22.4s, v10.4h, v0.4h[0]
2086 smlsl v22.4s, v11.4h, v1.4h[2]
2087
2088 smlsl v16.4s, v10.4h, v0.4h[0]
2089 smlal v16.4s, v11.4h, v5.4h[2]
2090
2091 smlal v18.4s, v10.4h, v0.4h[0]
2092 smlal v18.4s, v11.4h, v3.4h[2]
2093
// Group 6.
2094 ld1 {v12.4h, v13.4h},[x1],#16
2095 ld1 {v14.4h, v15.4h},[x1],x10
2096
2097
2098
2099
2100 smlsl v24.4s, v14.4h, v0.4h[1]
2101 smlal v26.4s, v14.4h, v6.4h[1]
2102 smlal v28.4s, v14.4h, v4.4h[1]
2103 smlsl v30.4s, v14.4h, v1.4h[1]
2104
2105
2106 smlsl v24.4s, v15.4h, v3.4h[3]
2107 smlal v26.4s, v15.4h, v0.4h[1]
2108 smlsl v28.4s, v15.4h, v5.4h[1]
2109 smlsl v30.4s, v15.4h, v6.4h[1]
2110
2111
2112 smlsl v20.4s, v12.4h, v3.4h[0]
2113 smlsl v20.4s, v13.4h, v1.4h[2]
2114 smlsl v22.4s, v12.4h, v7.4h[0]
2115 smlal v22.4s, v13.4h, v3.4h[2]
2116 smlal v16.4s, v12.4h, v1.4h[0]
2117 smlal v16.4s, v13.4h, v7.4h[2]
2118 smlsl v18.4s, v12.4h, v5.4h[0]
2119 smlsl v18.4s, v13.4h, v2.4h[2]
2120
2121
// Group 7.
2122 ld1 {v10.4h, v11.4h},[x1],#16
2123 ld1 {v8.4h, v9.4h},[x1],x10
2124
2125
2126 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
2127 smlal v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
2128 smlsl v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
2129 smlal v30.4s, v8.4h, v2.4h[1] //// y1 * sin1(part of b3)
2130
2131 smlal v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2132 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2133 smlsl v28.4s, v9.4h, v7.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2134 smlal v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2135
2136
2137
2138
2139
2140 smlsl v20.4s, v10.4h, v6.4h[0]
2141 smlal v20.4s, v11.4h, v5.4h[2]
2142
2143
2144 smlal v22.4s, v10.4h, v2.4h[0]
2145 smlal v22.4s, v11.4h, v7.4h[2]
2146
2147 smlsl v16.4s, v10.4h, v2.4h[0]
2148 smlsl v16.4s, v11.4h, v4.4h[2]
2149
2150 smlal v18.4s, v10.4h, v6.4h[0]
2151 smlal v18.4s, v11.4h, v1.4h[2]
2152
2153
// Group 8 (last); execution then falls through into stage2_shift2.
2154 ld1 {v12.4h, v13.4h},[x1],#16
2155 ld1 {v14.4h, v15.4h},[x1],x10
2156
2157
2158
2159 smlal v24.4s, v14.4h, v1.4h[1]
2160 smlsl v26.4s, v14.4h, v0.4h[3]
2161 smlal v28.4s, v14.4h, v1.4h[3]
2162 smlsl v30.4s, v14.4h, v3.4h[1]
2163
2164
2165 smlal v24.4s, v15.4h, v5.4h[3]
2166 smlsl v26.4s, v15.4h, v5.4h[1]
2167 smlal v28.4s, v15.4h, v4.4h[3]
2168 smlsl v30.4s, v15.4h, v4.4h[1]
2169
2170
2171 smlal v20.4s, v12.4h, v1.4h[0]
2172 smlal v20.4s, v13.4h, v3.4h[2]
2173 smlsl v22.4s, v12.4h, v3.4h[0]
2174 smlsl v22.4s, v13.4h, v2.4h[2]
2175 smlal v16.4s, v12.4h, v5.4h[0]
2176 smlal v16.4s, v13.4h, v1.4h[2]
2177 smlsl v18.4s, v12.4h, v7.4h[0]
2178 smlsl v18.4s, v13.4h, v0.4h[2]
2179
2180stage2_shift2:
2181 add v8.4s, v20.4s , v24.4s
2182 sub v10.4s, v20.4s , v24.4s
2183
2184 add v12.4s, v22.4s , v26.4s
2185 sub v24.4s, v22.4s , v26.4s
2186
2187 add v14.4s, v16.4s , v28.4s
2188 sub v26.4s, v16.4s , v28.4s
2189
2190
2191 add v16.4s, v18.4s , v30.4s
2192 sub v28.4s, v18.4s , v30.4s
2193
2194
2195 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2196 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2197 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2198 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2199 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2200 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2201 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2202 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2203
2204 umov x15,v24.d[0]
2205 umov x16,v25.d[0]
2206 umov x19,v26.d[0]
2207 umov x20,v27.d[0]
2208
2209 trn1 v24.4h, v30.4h, v12.4h
2210 trn2 v25.4h, v30.4h, v12.4h
2211 trn1 v26.4h, v31.4h, v13.4h
2212 trn2 v27.4h, v31.4h, v13.4h
2213
2214 trn1 v30.2s, v24.2s, v26.2s
2215 trn2 v31.2s, v24.2s, v26.2s
2216 trn1 v12.2s, v25.2s, v27.2s
2217 trn2 v13.2s, v25.2s, v27.2s
2218
2219 trn1 v24.4h, v14.4h, v18.4h
2220 trn2 v25.4h, v14.4h, v18.4h
2221 trn1 v26.4h, v15.4h, v19.4h
2222 trn2 v27.4h, v15.4h, v19.4h
2223
2224 trn1 v14.2s, v24.2s, v26.2s
2225 trn2 v15.2s, v24.2s, v26.2s
2226 trn1 v18.2s, v25.2s, v27.2s
2227 trn2 v19.2s, v25.2s, v27.2s
2228
2229 mov v24.d[0],x15
2230 mov v25.d[0],x16
2231 mov v26.d[0],x19
2232 mov v27.d[0],x20
2233
2234 st1 { v30.4h, v31.4h},[x0],#16
2235 st1 { v12.4h, v13.4h},[x0],#16
2236 st1 { v14.4h, v15.4h},[x0],#16
2237 st1 { v18.4h, v19.4h},[x0],#16
2238
2239
2240 mov x1,x4
2241
2242
2243
2244
2245 ld1 {v10.4h, v11.4h},[x1],#16
2246 ld1 {v8.4h, v9.4h},[x1],x10
2247
2248 smull v24.4s, v8.4h, v4.4h[1] //// y1 * cos1(part of b0)
2249 smull v26.4s, v8.4h, v4.4h[3] //// y1 * cos3(part of b1)
2250 smull v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
2251 smull v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
2252
2253 smlsl v24.4s, v9.4h, v3.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2254 smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2255 smlsl v28.4s, v9.4h, v0.4h[2] //// y1 * sin3 - y3 * cos1(part of b2)
2256 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2257
2258
2259
2260
2261
2262 smull v20.4s, v10.4h, v0.4h[0]
2263 smlsl v20.4s, v11.4h, v7.4h[2]
2264
2265
2266 smull v22.4s, v10.4h, v0.4h[0]
2267 smlsl v22.4s, v11.4h, v6.4h[2]
2268
2269 smull v16.4s, v10.4h, v0.4h[0]
2270 smlsl v16.4s, v11.4h, v5.4h[2]
2271
2272 smull v18.4s, v10.4h, v0.4h[0]
2273 smlsl v18.4s, v11.4h, v4.4h[2]
2274
2275 cmp x12,x11
2276 bhs stage2_shift3
2277
2278 ld1 {v12.4h, v13.4h},[x1],#16
2279 ld1 {v14.4h, v15.4h},[x1],x10
2280
2281 smlsl v24.4s, v14.4h, v5.4h[1]
2282 smlsl v26.4s, v14.4h, v7.4h[3]
2283 smlal v28.4s, v14.4h, v5.4h[3]
2284 smlal v30.4s, v14.4h, v3.4h[1]
2285
2286
2287 smlal v24.4s, v15.4h, v2.4h[1]
2288 smlal v26.4s, v15.4h, v1.4h[1]
2289 smlal v28.4s, v15.4h, v4.4h[3]
2290 smlsl v30.4s, v15.4h, v7.4h[3]
2291
2292
2293 smlsl v20.4s, v12.4h, v1.4h[0]
2294 smlal v20.4s, v13.4h, v6.4h[2]
2295 smlsl v22.4s, v12.4h, v3.4h[0]
2296 smlal v22.4s, v13.4h, v3.4h[2]
2297 smlsl v16.4s, v12.4h, v5.4h[0]
2298 smlal v16.4s, v13.4h, v0.4h[2]
2299 smlsl v18.4s, v12.4h, v7.4h[0]
2300 smlal v18.4s, v13.4h, v2.4h[2]
2301
2302 cmp x12,x5
2303 bhs stage2_shift3
2304
2305 ld1 {v10.4h, v11.4h},[x1],#16
2306 ld1 {v8.4h, v9.4h},[x1],x10
2307
2308
2309
2310 smlal v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
2311 smlsl v26.4s, v8.4h, v5.4h[1] //// y1 * cos3(part of b1)
2312 smlsl v28.4s, v8.4h, v0.4h[3] //// y1 * sin3(part of b2)
2313 smlsl v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
2314
2315 smlsl v24.4s, v9.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2316 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2317 smlal v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2318 smlal v30.4s, v9.4h, v0.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2319
2320
2321
2322
2323
2324 smlal v20.4s, v10.4h, v2.4h[0]
2325 smlsl v20.4s, v11.4h, v5.4h[2]
2326
2327
2328 smlal v22.4s, v10.4h, v6.4h[0]
2329 smlsl v22.4s, v11.4h, v0.4h[2]
2330
2331 smlsl v16.4s, v10.4h, v6.4h[0]
2332 smlsl v16.4s, v11.4h, v4.4h[2]
2333
2334 smlsl v18.4s, v10.4h, v2.4h[0]
2335 smlal v18.4s, v11.4h, v6.4h[2]
2336
2337 cmp x12,x6
2338 bhs stage2_shift3
2339
2340 ld1 {v12.4h, v13.4h},[x1],#16
2341 ld1 {v14.4h, v15.4h},[x1],x10
2342
2343
2344
2345
2346
2347 smlsl v24.4s, v14.4h, v7.4h[1]
2348 smlal v26.4s, v14.4h, v2.4h[1]
2349 smlal v28.4s, v14.4h, v4.4h[1]
2350 smlsl v30.4s, v14.4h, v5.4h[1]
2351
2352
2353 smlal v24.4s, v15.4h, v0.4h[3]
2354 smlal v26.4s, v15.4h, v7.4h[1]
2355 smlsl v28.4s, v15.4h, v1.4h[1]
2356 smlsl v30.4s, v15.4h, v6.4h[1]
2357
2358
2359 smlsl v20.4s, v12.4h, v3.4h[0]
2360 smlal v20.4s, v13.4h, v4.4h[2]
2361 smlal v22.4s, v12.4h, v7.4h[0]
2362 smlal v22.4s, v13.4h, v2.4h[2]
2363 smlal v16.4s, v12.4h, v1.4h[0]
2364 smlsl v16.4s, v13.4h, v6.4h[2]
2365 smlal v18.4s, v12.4h, v5.4h[0]
2366 smlsl v18.4s, v13.4h, v0.4h[2]
2367
2368 cmp x12,x9
2369 bhs stage2_shift3
2370
2371
2372 ld1 {v10.4h, v11.4h},[x1],#16
2373 ld1 {v8.4h, v9.4h},[x1],x10
2374
2375
2376 smlsl v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
2377 smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3(part of b1)
2378 smlal v28.4s, v8.4h, v6.4h[3] //// y1 * sin3(part of b2)
2379 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
2380
2381 smlsl v24.4s, v9.4h, v0.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2382 smlal v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2383 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2384 smlsl v30.4s, v9.4h, v2.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2385
2386
2387
2388
2389
2390 smlal v20.4s, v10.4h, v0.4h[0]
2391 smlsl v20.4s, v11.4h, v3.4h[2]
2392
2393
2394 smlsl v22.4s, v10.4h, v0.4h[0]
2395 smlsl v22.4s, v11.4h, v5.4h[2]
2396
2397 smlsl v16.4s, v10.4h, v0.4h[0]
2398 smlal v16.4s, v11.4h, v1.4h[2]
2399
2400 smlal v18.4s, v10.4h, v0.4h[0]
2401 smlal v18.4s, v11.4h, v7.4h[2]
2402
2403 ld1 {v12.4h, v13.4h},[x1],#16
2404 ld1 {v14.4h, v15.4h},[x1],x10
2405
2406
2407
2408
2409 smlal v24.4s, v14.4h, v6.4h[3]
2410 smlal v26.4s, v14.4h, v3.4h[3]
2411 smlsl v28.4s, v14.4h, v1.4h[3]
2412 smlal v30.4s, v14.4h, v7.4h[1]
2413
2414
2415 smlal v24.4s, v15.4h, v1.4h[3]
2416 smlsl v26.4s, v15.4h, v2.4h[3]
2417 smlal v28.4s, v15.4h, v7.4h[1]
2418 smlal v30.4s, v15.4h, v4.4h[1]
2419
2420
2421 smlsl v20.4s, v12.4h, v5.4h[0]
2422 smlal v20.4s, v13.4h, v2.4h[2]
2423 smlal v22.4s, v12.4h, v1.4h[0]
2424 smlsl v22.4s, v13.4h, v7.4h[2]
2425 smlsl v16.4s, v12.4h, v7.4h[0]
2426 smlsl v16.4s, v13.4h, v3.4h[2]
2427 smlsl v18.4s, v12.4h, v3.4h[0]
2428 smlal v18.4s, v13.4h, v1.4h[2]
2429
2430
2431 ld1 {v10.4h, v11.4h},[x1],#16
2432 ld1 {v8.4h, v9.4h},[x1],x10
2433
2434
2435 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
2436 smlsl v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
2437 smlal v28.4s, v8.4h, v3.4h[1] //// y1 * sin3(part of b2)
2438 smlsl v30.4s, v8.4h, v0.4h[1] //// y1 * sin1(part of b3)
2439
2440 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2441 smlal v26.4s, v9.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2442 smlsl v28.4s, v9.4h, v2.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2443 smlal v30.4s, v9.4h, v4.4h[3] //// y1 * sin1 - y3 * sin3(part of b3)
2444
2445
2446
2447
2448
2449 smlal v20.4s, v10.4h, v6.4h[0]
2450 smlsl v20.4s, v11.4h, v1.4h[2]
2451
2452
2453 smlsl v22.4s, v10.4h, v2.4h[0]
2454 smlal v22.4s, v11.4h, v4.4h[2]
2455
2456 smlal v16.4s, v10.4h, v2.4h[0]
2457 smlsl v16.4s, v11.4h, v7.4h[2]
2458
2459 smlsl v18.4s, v10.4h, v6.4h[0]
2460 smlsl v18.4s, v11.4h, v5.4h[2]
2461
2462 ld1 {v12.4h, v13.4h},[x1],#16
2463 ld1 {v14.4h, v15.4h},[x1],x10
2464
2465
2466
2467 smlal v24.4s, v14.4h, v4.4h[3]
2468 smlsl v26.4s, v14.4h, v6.4h[1]
2469 smlal v28.4s, v14.4h, v7.4h[3]
2470 smlal v30.4s, v14.4h, v6.4h[3]
2471
2472
2473 smlal v24.4s, v15.4h, v3.4h[3]
2474 smlsl v26.4s, v15.4h, v3.4h[1]
2475 smlal v28.4s, v15.4h, v2.4h[3]
2476 smlsl v30.4s, v15.4h, v2.4h[1]
2477
2478
2479 smlsl v20.4s, v12.4h, v7.4h[0]
2480 smlal v20.4s, v13.4h, v0.4h[2]
2481 smlal v22.4s, v12.4h, v5.4h[0]
2482 smlsl v22.4s, v13.4h, v1.4h[2]
2483 smlsl v16.4s, v12.4h, v3.4h[0]
2484 smlal v16.4s, v13.4h, v2.4h[2]
2485 smlal v18.4s, v12.4h, v1.4h[0]
2486 smlsl v18.4s, v13.4h, v3.4h[2]
2487
// stage2_shift3
// -------------
// Entered by fall-through, or early through the `bhs stage2_shift3` exits of
// the multiply-accumulate run that precedes this label.
//   1. Butterfly the accumulators: a0..a3 = v20, v22, v16, v18 and
//      b0..b3 = v24, v26, v28, v30 (naming taken from the "a +/- b"
//      comments on the sqrshrn lines below).
//   2. Round, shift right by shift_stage2_idct and saturate to 16 bit.
//   3. Transpose the two resulting 4x4 tiles of 16-bit lanes in place and
//      store them (64 bytes in total) through x0, post-incrementing x0.
//   4. Reload x1 from x4 and start a fresh set of a/b accumulators (smull)
//      for the next group; that run falls into, or exits early to,
//      stage2_shift4.
// Multipliers are lanes of v0-v7, which are only read in this section
// (they are loaded before it).
// NOTE(review): the trailing "y1 * cos1 ..." comments on the multiply lines
// repeat the same text whatever lane or sign the instruction uses (several
// smlsl lines are described with "+"); treat them as stale.
// NOTE(review): by-element operands are spelled vN.4h[i]; the architectural
// spelling is vN.h[i] and not every assembler accepts the former - confirm
// against the toolchains this file must build with.
2488stage2_shift3:
// Butterfly. Sums land in v8/v12/v14/v16, differences in v10/v24/v26/v28.
// Order matters: v24, v26, v16 and v28 are overwritten only after their
// last use as an input.
2489 add v8.4s, v20.4s , v24.4s
2490 sub v10.4s, v20.4s , v24.4s
2491
2492 add v12.4s, v22.4s , v26.4s
2493 sub v24.4s, v22.4s , v26.4s
2494
2495 add v14.4s, v16.4s , v28.4s
2496 sub v26.4s, v16.4s , v28.4s
2497
2498
2499 add v16.4s, v18.4s , v30.4s
2500 sub v28.4s, v18.4s , v30.4s
2501
2502
// sqrshrn: signed saturating rounding shift right, narrowing 32 -> 16 bit.
// NOTE(review): the output indices in the comments below (x0, x11, x2, ...)
// are not consistent with one another; only the a/b pairing they state
// matches the add/sub lines above.
2503 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2504 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2505 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2506 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2507 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2508 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2509 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2510 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2511
// v24-v27 serve as scratch for the transposes below: park their low 64 bits
// in x15/x16/x19/x20 and put them back afterwards.
2512 umov x15,v24.d[0]
2513 umov x16,v25.d[0]
2514 umov x19,v26.d[0]
2515 umov x20,v27.d[0]
2516
// 4x4 transpose of 16-bit lanes, rows taken in the order v30, v12, v31, v13;
// the result is written back to the same registers.
2517 trn1 v24.4h, v30.4h, v12.4h
2518 trn2 v25.4h, v30.4h, v12.4h
2519 trn1 v26.4h, v31.4h, v13.4h
2520 trn2 v27.4h, v31.4h, v13.4h
2521
2522 trn1 v30.2s, v24.2s, v26.2s
2523 trn2 v31.2s, v24.2s, v26.2s
2524 trn1 v12.2s, v25.2s, v27.2s
2525 trn2 v13.2s, v25.2s, v27.2s
2526
// Same transpose for the difference tile, rows in the order v14, v18, v15, v19.
2527 trn1 v24.4h, v14.4h, v18.4h
2528 trn2 v25.4h, v14.4h, v18.4h
2529 trn1 v26.4h, v15.4h, v19.4h
2530 trn2 v27.4h, v15.4h, v19.4h
2531
2532 trn1 v14.2s, v24.2s, v26.2s
2533 trn2 v15.2s, v24.2s, v26.2s
2534 trn1 v18.2s, v25.2s, v27.2s
2535 trn2 v19.2s, v25.2s, v27.2s
2536
// Restore the low halves of v24-v27 saved before the transposes.
2537 mov v24.d[0],x15
2538 mov v25.d[0],x16
2539 mov v26.d[0],x19
2540 mov v27.d[0],x20
2541
// Write both transposed tiles: 4 x 16 bytes, x0 post-incremented by 64 in total.
2542 st1 { v30.4h, v31.4h},[x0],#16
2543 st1 { v12.4h, v13.4h},[x0],#16
2544 st1 { v14.4h, v15.4h},[x0],#16
2545 st1 { v18.4h, v19.4h},[x0],#16
2546
2547
2548
// Next accumulation run: restart the source pointer from x4.
2549 mov x1,x4
2550
2551
2552
2553
// Each load step reads 16 bytes, advances x1 by 16, reads 16 more and then
// advances x1 by x10. v8/v9 and v14/v15 feed the b accumulators
// (v24/v26/v28/v30); v10/v11 and v12/v13 feed the a accumulators
// (v20/v22/v16/v18).
2554 ld1 {v10.4h, v11.4h},[x1],#16
2555 ld1 {v8.4h, v9.4h},[x1],x10
2556
2557
// smull (not smlal): these start the b accumulators from scratch.
2558 smull v24.4s, v8.4h, v6.4h[1] //// y1 * cos1(part of b0)
2559 smull v26.4s, v8.4h, v6.4h[3] //// y1 * cos3(part of b1)
2560 smull v28.4s, v8.4h, v7.4h[1] //// y1 * sin3(part of b2)
2561 smull v30.4s, v8.4h, v7.4h[3] //// y1 * sin1(part of b3)
2562
2563 smlsl v24.4s, v9.4h, v2.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2564 smlsl v26.4s, v9.4h, v4.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2565 smlsl v28.4s, v9.4h, v5.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2566 smlsl v30.4s, v9.4h, v7.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2567
2568
2569
2570
2571
// smull: start the a accumulators from scratch.
2572 smull v20.4s, v10.4h, v0.4h[0]
2573 smlsl v20.4s, v11.4h, v3.4h[2]
2574
2575
2576 smull v22.4s, v10.4h, v0.4h[0]
2577 smlsl v22.4s, v11.4h, v2.4h[2]
2578
2579 smull v16.4s, v10.4h, v0.4h[0]
2580 smlsl v16.4s, v11.4h, v1.4h[2]
2581
2582 smull v18.4s, v10.4h, v0.4h[0]
2583 smlsl v18.4s, v11.4h, v0.4h[2]
2584
// Early exit when x12 >= x11 (unsigned): the remaining loads and MACs are
// skipped. x12 and the thresholds x11/x5/x6/x9 are set before this section;
// presumably they let all-zero input be skipped - confirm.
2585 cmp x12,x11
2586 bhs stage2_shift4
2587 ld1 {v12.4h, v13.4h},[x1],#16
2588 ld1 {v14.4h, v15.4h},[x1],x10
2589
2590
2591
2592
2593
2594
2595 smlal v24.4s, v14.4h, v0.4h[1]
2596 smlal v26.4s, v14.4h, v1.4h[3]
2597 smlal v28.4s, v14.4h, v4.4h[1]
2598 smlal v30.4s, v14.4h, v6.4h[3]
2599
2600
2601 smlsl v24.4s, v15.4h, v4.4h[1]
2602 smlsl v26.4s, v15.4h, v0.4h[3]
2603 smlsl v28.4s, v15.4h, v2.4h[3]
2604 smlsl v30.4s, v15.4h, v6.4h[1]
2605
2606
2607 smlal v20.4s, v12.4h, v7.4h[0]
2608 smlal v20.4s, v13.4h, v5.4h[2]
2609 smlal v22.4s, v12.4h, v5.4h[0]
2610 smlsl v22.4s, v13.4h, v7.4h[2]
2611 smlal v16.4s, v12.4h, v3.4h[0]
2612 smlsl v16.4s, v13.4h, v4.4h[2]
2613 smlal v18.4s, v12.4h, v1.4h[0]
2614 smlsl v18.4s, v13.4h, v1.4h[2]
2615
// Second early exit: taken when x12 >= x5 (unsigned).
2616 cmp x12,x5
2617 bhs stage2_shift4
2618
2619 ld1 {v10.4h, v11.4h},[x1],#16
2620 ld1 {v8.4h, v9.4h},[x1],x10
2621
2622
2623
2624 smlal v24.4s, v8.4h, v7.4h[3] //// y1 * cos1(part of b0)
2625 smlal v26.4s, v8.4h, v3.4h[1] //// y1 * cos3(part of b1)
2626 smlal v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
2627 smlal v30.4s, v8.4h, v5.4h[3] //// y1 * sin1(part of b3)
2628
2629 smlal v24.4s, v9.4h, v4.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2630 smlsl v26.4s, v9.4h, v5.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2631 smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2632 smlsl v30.4s, v9.4h, v5.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2633
2634
2635
2636
2637
2638 smlsl v20.4s, v10.4h, v2.4h[0]
2639 smlal v20.4s, v11.4h, v1.4h[2]
2640
2641
2642 smlsl v22.4s, v10.4h, v6.4h[0]
2643 smlal v22.4s, v11.4h, v3.4h[2]
2644
2645 smlal v16.4s, v10.4h, v6.4h[0]
2646 smlsl v16.4s, v11.4h, v7.4h[2]
2647
2648 smlal v18.4s, v10.4h, v2.4h[0]
2649 smlsl v18.4s, v11.4h, v2.4h[2]
2650
// Third early exit: taken when x12 >= x6 (unsigned).
2651 cmp x12,x6
2652 bhs stage2_shift4
2653
2654
2655 ld1 {v12.4h, v13.4h},[x1],#16
2656 ld1 {v14.4h, v15.4h},[x1],x10
2657
2658
2659
2660
2661
2662
2663 smlsl v24.4s, v14.4h, v1.4h[1]
2664 smlsl v26.4s, v14.4h, v7.4h[3]
2665 smlal v28.4s, v14.4h, v1.4h[3]
2666 smlal v30.4s, v14.4h, v4.4h[3]
2667
2668
2669 smlal v24.4s, v15.4h, v2.4h[1]
2670 smlal v26.4s, v15.4h, v5.4h[1]
2671 smlsl v28.4s, v15.4h, v3.4h[1]
2672 smlsl v30.4s, v15.4h, v4.4h[1]
2673
2674
2675 smlsl v20.4s, v12.4h, v5.4h[0]
2676 smlsl v20.4s, v13.4h, v7.4h[2]
2677 smlsl v22.4s, v12.4h, v1.4h[0]
2678 smlal v22.4s, v13.4h, v1.4h[2]
2679 smlsl v16.4s, v12.4h, v7.4h[0]
2680 smlal v16.4s, v13.4h, v5.4h[2]
2681 smlal v18.4s, v12.4h, v3.4h[0]
2682 smlsl v18.4s, v13.4h, v3.4h[2]
2683
// Last early exit: taken when x12 >= x9 (unsigned). Past this point the
// remaining four load/MAC steps run unconditionally.
2684 cmp x12,x9
2685 bhs stage2_shift4
2686
2687
2688 ld1 {v10.4h, v11.4h},[x1],#16
2689 ld1 {v8.4h, v9.4h},[x1],x10
2690
2691
2692 smlsl v24.4s, v8.4h, v5.4h[3] //// y1 * cos1(part of b0)
2693 smlsl v26.4s, v8.4h, v2.4h[3] //// y1 * cos3(part of b1)
2694 smlal v28.4s, v8.4h, v4.4h[3] //// y1 * sin3(part of b2)
2695 smlal v30.4s, v8.4h, v3.4h[3] //// y1 * sin1(part of b3)
2696
2697 smlsl v24.4s, v9.4h, v6.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
2698 smlal v26.4s, v9.4h, v0.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
2699 smlsl v28.4s, v9.4h, v6.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
2700 smlsl v30.4s, v9.4h, v3.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2701
2702
2703
2704
2705
2706 smlal v20.4s, v10.4h, v0.4h[0]
2707 smlsl v20.4s, v11.4h, v0.4h[2]
2708
2709
2710 smlsl v22.4s, v10.4h, v0.4h[0]
2711 smlal v22.4s, v11.4h, v6.4h[2]
2712
2713 smlsl v16.4s, v10.4h, v0.4h[0]
2714 smlal v16.4s, v11.4h, v2.4h[2]
2715
2716 smlal v18.4s, v10.4h, v0.4h[0]
2717 smlsl v18.4s, v11.4h, v4.4h[2]
2718
2719 ld1 {v12.4h, v13.4h},[x1],#16
2720 ld1 {v14.4h, v15.4h},[x1],x10
2721
2722
2723
2724
2725 smlal v24.4s, v14.4h, v3.4h[1]
2726 smlsl v26.4s, v14.4h, v2.4h[1]
2727 smlal v28.4s, v14.4h, v7.4h[3]
2728 smlal v30.4s, v14.4h, v2.4h[3]
2729
2730
2731 smlsl v24.4s, v15.4h, v0.4h[3]
2732 smlal v26.4s, v15.4h, v4.4h[3]
2733 smlal v28.4s, v15.4h, v6.4h[3]
2734 smlsl v30.4s, v15.4h, v2.4h[1]
2735
2736
2737 smlal v20.4s, v12.4h, v3.4h[0]
2738 smlsl v20.4s, v13.4h, v6.4h[2]
2739 smlal v22.4s, v12.4h, v7.4h[0]
2740 smlsl v22.4s, v13.4h, v4.4h[2]
2741 smlsl v16.4s, v12.4h, v1.4h[0]
2742 smlal v16.4s, v13.4h, v0.4h[2]
2743 smlal v18.4s, v12.4h, v5.4h[0]
2744 smlsl v18.4s, v13.4h, v5.4h[2]
2745
2746
2747 ld1 {v10.4h, v11.4h},[x1],#16
2748 ld1 {v8.4h, v9.4h},[x1],x10
2749
2750
2751
2752
2753 smlal v24.4s, v8.4h, v3.4h[3] //// y1 * cos1(part of b0)
2754 smlsl v26.4s, v8.4h, v7.4h[1] //// y1 * cos3(part of b1)
2755 smlsl v28.4s, v8.4h, v5.4h[1] //// y1 * sin3(part of b2)
2756 smlal v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
2757
2758 smlsl v24.4s, v9.4h, v7.4h[1] //// y1 * cos1 + y3 * cos3(part of b0)
2759 smlsl v26.4s, v9.4h, v6.4h[1] //// y1 * cos3 - y3 * sin1(part of b1)
2760 smlal v28.4s, v9.4h, v3.4h[3] //// y1 * sin3 - y3 * cos1(part of b2)
2761 smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
2762
2763
2764
2765
2766
2767 smlsl v20.4s, v10.4h, v6.4h[0]
2768 smlal v20.4s, v11.4h, v2.4h[2]
2769
2770
2771 smlal v22.4s, v10.4h, v2.4h[0]
2772 smlsl v22.4s, v11.4h, v0.4h[2]
2773
2774 smlsl v16.4s, v10.4h, v2.4h[0]
2775 smlal v16.4s, v11.4h, v3.4h[2]
2776
2777 smlal v18.4s, v10.4h, v6.4h[0]
2778 smlsl v18.4s, v11.4h, v6.4h[2]
2779
2780
2781 ld1 {v12.4h, v13.4h},[x1],#16
2782 ld1 {v14.4h, v15.4h},[x1],x10
2783
2784
2785
2786 smlsl v24.4s, v14.4h, v5.4h[1]
2787 smlal v26.4s, v14.4h, v3.4h[3]
2788 smlsl v28.4s, v14.4h, v2.4h[1]
2789 smlal v30.4s, v14.4h, v0.4h[3]
2790
2791
2792 smlal v24.4s, v15.4h, v1.4h[3]
2793 smlsl v26.4s, v15.4h, v1.4h[1]
2794 smlal v28.4s, v15.4h, v0.4h[3]
2795 smlsl v30.4s, v15.4h, v0.4h[1]
2796
2797
2798 smlsl v20.4s, v12.4h, v1.4h[0]
2799 smlal v20.4s, v13.4h, v4.4h[2]
2800 smlal v22.4s, v12.4h, v3.4h[0]
2801 smlsl v22.4s, v13.4h, v5.4h[2]
2802 smlsl v16.4s, v12.4h, v5.4h[0]
2803 smlal v16.4s, v13.4h, v6.4h[2]
2804 smlal v18.4s, v12.4h, v7.4h[0]
2805 smlsl v18.4s, v13.4h, v7.4h[2]
2806
// stage2_shift4
// -------------
// Entered by fall-through, or early through the `bhs stage2_shift4` exits of
// the multiply-accumulate run above.
// Combines a0..a3 = v20, v22, v16, v18 with b0..b3 = v24, v26, v28, v30
// (naming from the sqrshrn comments), narrows the results to 16 bit with
// rounding and saturation, transposes the two 4x4 tiles in place, stores
// them (64 bytes) through x0 and finally steps x0 back by 256 bytes.
2807stage2_shift4:
// Butterfly. Sums land in v8/v12/v14/v16, differences in v10/v24/v26/v28.
// Order matters: v24, v26, v16 and v28 are overwritten only after their
// last use as an input.
2808 add v8.4s, v20.4s , v24.4s
2809 sub v10.4s, v20.4s , v24.4s
2810
2811 add v12.4s, v22.4s , v26.4s
2812 sub v24.4s, v22.4s , v26.4s
2813
2814 add v14.4s, v16.4s , v28.4s
2815 sub v26.4s, v16.4s , v28.4s
2816
2817
2818 add v16.4s, v18.4s , v30.4s
2819 sub v28.4s, v18.4s , v30.4s
2820
2821
// sqrshrn: signed saturating rounding shift right, narrowing 32 -> 16 bit.
// NOTE(review): the output indices in the comments below (x0, x11, x2, ...)
// are not consistent with one another; only the a/b pairing they state
// matches the add/sub lines above.
2822 sqrshrn v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2823 sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x11 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2824 sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2825 sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2826 sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2827 sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2828 sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2829 sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2830
2831
2832
// v24-v27 serve as scratch for the transposes below: park their low 64 bits
// in x15/x16/x19/x20 and put them back afterwards.
2833 umov x15,v24.d[0]
2834 umov x16,v25.d[0]
2835 umov x19,v26.d[0]
2836 umov x20,v27.d[0]
2837
// 4x4 transpose of 16-bit lanes, rows taken in the order v30, v12, v31, v13;
// the result is written back to the same registers.
2838 trn1 v24.4h, v30.4h, v12.4h
2839 trn2 v25.4h, v30.4h, v12.4h
2840 trn1 v26.4h, v31.4h, v13.4h
2841 trn2 v27.4h, v31.4h, v13.4h
2842
2843 trn1 v30.2s, v24.2s, v26.2s
2844 trn2 v31.2s, v24.2s, v26.2s
2845 trn1 v12.2s, v25.2s, v27.2s
2846 trn2 v13.2s, v25.2s, v27.2s
2847
// Same transpose for the difference tile, rows in the order v14, v18, v15, v19.
2848 trn1 v24.4h, v14.4h, v18.4h
2849 trn2 v25.4h, v14.4h, v18.4h
2850 trn1 v26.4h, v15.4h, v19.4h
2851 trn2 v27.4h, v15.4h, v19.4h
2852
2853 trn1 v14.2s, v24.2s, v26.2s
2854 trn2 v15.2s, v24.2s, v26.2s
2855 trn1 v18.2s, v25.2s, v27.2s
2856 trn2 v19.2s, v25.2s, v27.2s
2857
// Restore the low halves of v24-v27 saved before the transposes.
2858 mov v24.d[0],x15
2859 mov v25.d[0],x16
2860 mov v26.d[0],x19
2861 mov v27.d[0],x20
2862
// Write both transposed tiles: 4 x 16 bytes, x0 post-incremented by 64 in total.
2863 st1 { v30.4h, v31.4h},[x0],#16
2864 st1 { v12.4h, v13.4h},[x0],#16
2865 st1 { v14.4h, v15.4h},[x0],#16
2866 st1 { v18.4h, v19.4h},[x0],#16
2867
2868
2869
2870
// Step x0 back 256 bytes. Presumably this is the start of the four 64-byte
// groups written by the stage-2 store sequences (two of them are outside
// this section) - confirm.
2871 sub x0,x0,#256
// prediction_buffer
// -----------------
// Reconstruction step:
//   - read 16-bit values back through x0,
//   - add 8-bit samples read through x2 (line step x8), widening to 16 bit,
//   - saturate to unsigned 8 bit and store through x3 (line step x7).
// This is done twice: first for bytes 0..15 of four strided lines, then for
// bytes 16..31 of the same four lines. Net pointer effect of the section:
// x0 is back where it started, x2 has advanced by 4*x8, x3 by 4*x7.
// Afterwards x14 is decremented and control returns to dct_stage2 until it
// reaches zero; then x19/x20 and the SIMD registers are restored and the
// function returns.
// Presumably x2 is the prediction pointer, x3 the destination and x8/x7
// their strides (all set up before this section) - confirm.
// NOTE(review): the second half addresses the same four lines at +16 bytes;
// check this against how x8/x7 are derived for a 16-wide block.
2872prediction_buffer:
2873
2874
// First half: take the first 32 bytes of each 64-byte group at x0
// (offsets 0, 64, 128 and 192 from the entry value of x0).
2875 ld1 {v12.8h},[x0],#16
2876 ld1 {v14.8h},[x0],#16
2877
2878 add x0,x0,#32
2879
2880 ld1 {v16.8h},[x0],#16
2881 ld1 {v18.8h},[x0],#16
2882 add x0,x0,#32
2883
2884 ld1 {v20.8h},[x0],#16
2885 ld1 {v22.8h},[x0],#16
2886
2887
2888 add x0,x0,#32
2889
2890 ld1 {v24.8h},[x0],#16
2891 ld1 {v26.8h},[x0],#16
2892
2893
2894
2895
2896
// The d12..d27 names below are 64-bit register names from the 32-bit NEON
// version. NOTE(review): here they presumably denote the low/high halves of
// v12, v14, v16, ... v26 as loaded above - confirm.
2897// d12 =x0 1- 4 values
2898// d13 =x2 1- 4 values
2899// d14=x1 1- 4 values
2900// d15=x3 1- 4 values
2901
2902// d16 =x0 5- 8 values
2903// d17 =x2 5- 8 values
2904// d18=x1 5- 8 values
2905// d19=x3 5- 8 values
2906
2907// d20 =x0 9- 12 values
2908// d21 =x2 9- 12 values
2909// d22=x1 9- 12 values
2910// d23=x3 9- 12 values
2911
2912// d24 =x0 13-16 values
2913// d25 =x2 13- 16 values
2914// d26=x1 13- 16 values
2915// d27=x3 13- 16 values
2916
// Each swap below exchanges two 64-bit halves, using the low half of the
// odd-numbered neighbour register (v13, v21, v15, v23) as the temporary.
2917 // swapping v12 upper and v16 lower 64bits
2918 mov v13.d[0], v12.d[1]
2919 mov v12.d[1], v16.d[0]
2920 mov v16.d[0], v13.d[0]
2921 // swapping v20 upper and v24 lower 64bits
2922 mov v21.d[0], v20.d[1]
2923 mov v20.d[1], v24.d[0]
2924 mov v24.d[0], v21.d[0]
2925 // swapping v14 upper and v18 lower 64bits
2926 mov v15.d[0], v14.d[1]
2927 mov v14.d[1], v18.d[0]
2928 mov v18.d[0], v15.d[0]
2929 // swapping v22 upper and v26 lower 64bits
2930 mov v23.d[0], v22.d[1]
2931 mov v22.d[1], v26.d[0]
2932 mov v26.d[0], v23.d[0]
2933
2934
// Four lines of 16 bytes each from x2, stepping by x8 after every line.
2935 ld1 {v8.8b, v9.8b},[x2],x8
2936 ld1 {v10.8b, v11.8b},[x2],x8
2937 ld1 {v28.8b, v29.8b},[x2],x8
2938 ld1 {v30.8b, v31.8b},[x2],x8
2939
2940
// uaddw: add the zero-extended bytes to the 16-bit lanes. Line 0 ends up in
// v12/v20, line 1 in v14/v22, line 2 in v16/v24, line 3 in v18/v26.
2941 uaddw v12.8h, v12.8h , v8.8b
2942 uaddw v20.8h, v20.8h , v9.8b
2943 uaddw v14.8h, v14.8h , v10.8b
2944 uaddw v22.8h, v22.8h , v11.8b
2945 uaddw v16.8h, v16.8h , v28.8b
2946 uaddw v24.8h, v24.8h , v29.8b
2947 uaddw v18.8h, v18.8h , v30.8b
2948 uaddw v26.8h, v26.8h , v31.8b
// Rewind x2 by the four line steps just taken and move 16 bytes along the line.
2949 sub x2,x2,x8,lsl #2
2950 add x2,x2,#16
// sqxtun: signed saturating narrow to unsigned 8 bit (clamps to 0..255).
// Order matters: v20, v14 and v22 are read as sources before they are
// overwritten as destinations.
2951 sqxtun v12.8b, v12.8h
2952 sqxtun v13.8b, v20.8h
2953 sqxtun v20.8b, v14.8h
2954 sqxtun v21.8b, v22.8h
2955 sqxtun v14.8b, v16.8h
2956 sqxtun v15.8b, v24.8h
2957 sqxtun v22.8b, v18.8h
2958 sqxtun v23.8b, v26.8h
2959
2960
// Store the four 16-byte lines through x3, stepping by x7 after every line.
2961 st1 {v12.8b, v13.8b},[x3],x7
2962 st1 {v20.8b, v21.8b},[x3],x7
2963 st1 {v14.8b, v15.8b},[x3],x7
2964 st1 {v22.8b, v23.8b},[x3],x7
2965
2966
// Rewind x3 by four line steps and move 16 bytes along the line, mirroring x2.
2967 sub x3,x3,x7,lsl #2
2968 add x3,x3,#16
2969
// Second half: take the remaining 32 bytes of each 64-byte group, walking
// the groups backwards (offsets 224, 160, 96 and 32 from the entry value of
// x0). The final sub leaves x0 at its entry value.
2970 ld1 {v12.8h},[x0],#16
2971 ld1 {v14.8h},[x0],#16
2972
2973 sub x0,x0,#96
2974
2975 ld1 {v16.8h},[x0],#16
2976 ld1 {v18.8h},[x0],#16
2977 sub x0,x0,#96
2978
2979 ld1 {v20.8h},[x0],#16
2980 ld1 {v22.8h},[x0],#16
2981
2982
2983 sub x0,x0,#96
2984
2985 ld1 {v24.8h},[x0],#16
2986 ld1 {v26.8h},[x0],#16
2987
2988
2989 sub x0,x0,#64
2990
2991
2992 // swapping v12 upper and v16 lower 64bits
2993 mov v13.d[0], v12.d[1]
2994 mov v12.d[1], v16.d[0]
2995 mov v16.d[0], v13.d[0]
2996 // swapping v20 upper and v24 lower 64bits
2997 mov v21.d[0], v20.d[1]
2998 mov v20.d[1], v24.d[0]
2999 mov v24.d[0], v21.d[0]
3000 // swapping v14 upper and v18 lower 64bits
3001 mov v15.d[0], v14.d[1]
3002 mov v14.d[1], v18.d[0]
3003 mov v18.d[0], v15.d[0]
3004 // swapping v22 upper and v26 lower 64bits
3005 mov v23.d[0], v22.d[1]
3006 mov v22.d[1], v26.d[0]
3007 mov v26.d[0], v23.d[0]
3008
3009
// Same four lines as in the first half, now starting 16 bytes further along.
3010 ld1 {v8.8b, v9.8b},[x2],x8
3011 ld1 {v10.8b, v11.8b},[x2],x8
3012 ld1 {v28.8b, v29.8b},[x2],x8
3013 ld1 {v30.8b, v31.8b},[x2],x8
3014
3015
3016 uaddw v12.8h, v12.8h , v8.8b
3017 uaddw v20.8h, v20.8h , v9.8b
3018 uaddw v14.8h, v14.8h , v10.8b
3019 uaddw v22.8h, v22.8h , v11.8b
3020 uaddw v16.8h, v16.8h , v28.8b
3021 uaddw v24.8h, v24.8h , v29.8b
3022 uaddw v18.8h, v18.8h , v30.8b
3023 uaddw v26.8h, v26.8h , v31.8b
// Undo the +16 byte offset: x2 is now four line steps past its entry value.
3024 sub x2,x2,#16
3025
// Same order-dependent narrowing sequence as in the first half.
3026 sqxtun v12.8b, v12.8h
3027 sqxtun v13.8b, v20.8h
3028 sqxtun v20.8b, v14.8h
3029 sqxtun v21.8b, v22.8h
3030 sqxtun v14.8b, v16.8h
3031 sqxtun v15.8b, v24.8h
3032 sqxtun v22.8b, v18.8h
3033 sqxtun v23.8b, v26.8h
3034
3035
3036 st1 {v12.8b, v13.8b},[x3],x7
3037 st1 {v20.8b, v21.8b},[x3],x7
3038 st1 {v14.8b, v15.8b},[x3],x7
3039 st1 {v22.8b, v23.8b},[x3],x7
3040
// Undo the +16 byte offset: x3 is now four line steps past its entry value.
3041 sub x3,x3,#16
3042
// Loop control: repeat from dct_stage2 until x14 has counted down to zero.
3043 subs x14,x14,#1
3044 bne dct_stage2
// Epilogue. The commented-out ldmfd is the 32-bit ARM epilogue, kept for
// reference. x19/x20 (used as scratch around the stage-2 transposes) are
// reloaded from the stack; pop_v_regs is a macro defined elsewhere,
// presumably restoring the callee-saved SIMD registers - confirm.
3045 // ldmfd sp!,{x0-x12,pc}
3046 ldp x19, x20,[sp],#16
3047 pop_v_regs
3048 ret
3049
3050
3051
3052
3053