///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_11_to_17.s
//*
//* @brief
//*  contains function definitions for intra prediction chroma mode 11 to 17
//*  functions are coded using neon intrinsics and can be compiled using rvct
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  chroma intra prediction interpolation filter for modes 11 to 17
//*
//* @par description:
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  intra prediction mode (11 to 17)
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_chroma_mode_11_to_17(uword8* pu1_ref,
//                                           word32 src_strd,
//                                           uword8* pu1_dst,
//                                           word32 dst_strd,
//                                           word32 nt,
//                                           word32 mode)
//
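//**************illustrative call (sketch)*************************************
//a minimal usage sketch; the buffer sizes, block size nt = 8 and mode = 14
//below are assumptions chosen only for illustration, and src_strd appears to
//be unused by this routine:
//
//    uword8 ref[2 * (4 * 8 + 1)];     /* interleaved (Cb,Cr) neighbour samples */
//    uword8 dst[2 * 8 * 8];           /* interleaved (Cb,Cr) prediction output */
//    ihevc_intra_pred_chroma_mode_11_to_17(ref, 0, dst, 2 * 8, 8, 14);
//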
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd

//x4 => nt
//x5 => mode
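
//**************rough algorithm sketch*****************************************
//a rough C sketch (not taken from the reference code) of what the assembly
//below computes for the negative-angle, horizontal-class chroma modes 11..17;
//gai4_ihevc_ang_table[mode] supplies intra_pred_ang, gai4_ihevc_inv_ang_table
//[mode - 11] supplies inv_ang, and ref_main stands for the reordered reference
//ref_temp with an assumed origin (exact offsets are handled by the code):
//
//    for(col = 0; col < nt; col++)
//    {
//        pos   = (col + 1) * intra_pred_ang;
//        idx   = pos >> 5;
//        fract = pos & 31;
//        for(row = 0; row < nt; row++)
//        {
//            /* U and V share idx/fract; V samples sit one byte after U */
//            pu1_dst[row * dst_strd + 2 * col] =
//                ((32 - fract) * ref_main[2 * (row + idx)]
//                      + fract * ref_main[2 * (row + idx) + 2] + 16) >> 5;
//            pu1_dst[row * dst_strd + 2 * col + 1] =
//                ((32 - fract) * ref_main[2 * (row + idx) + 1]
//                      + fract * ref_main[2 * (row + idx) + 3] + 16) >> 5;
//        }
//    }
//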

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_chroma_mode_11_to_17_av8
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_11_17

.type ihevc_intra_pred_chroma_mode_11_to_17_av8, %function

ihevc_intra_pred_chroma_mode_11_to_17_av8:

    // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments
    push_v_regs
    stp x19, x20,[sp,#-16]!

    adrp x7, :got:gai4_ihevc_ang_table
    ldr x7, [x7, #:got_lo12:gai4_ihevc_ang_table]

    adrp x8, :got:gai4_ihevc_inv_ang_table
    ldr x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]

    add x7, x7, x5, lsl #2 //gai4_ihevc_ang_table[mode]
    add x8, x8, x5, lsl #2 //gai4_ihevc_inv_ang_table[mode - 11]
    sub x8, x8, #44
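    //the inv_ang table holds word32 entries and is indexed by (mode - 11),
    //hence the 11 entries * 4 bytes = #44 bias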

    ldr w7, [x7] //intra_pred_ang
    sxtw x7,w7
    sub sp, sp, #132 //ref_temp[2 * max_cu_size + 2]

    ldr w8, [x8] //inv_ang
    sxtw x8,w8
    add x6, sp, x4, lsl #1 //ref_temp + 2 * nt

    mul x9, x4, x7 //nt*intra_pred_ang

    sub x6, x6, #2 //ref_temp + 2*nt - 2

    add x1, x0, x4, lsl #2 //x1 = &src[4nt]
    dup v30.8b,w7 //intra_pred_ang

    mov x7, x4

    sub x1,x1,#6 //address calculation for copying 4 halfwords

    asr x9, x9, #5

    ld1 {v0.8b},[x1]
    rev64 v0.4h, v0.4h
    st1 {v0.8b},[x6],#8

    sub x1, x1,#8

    subs x7, x7, #4
    add x20, x1,#8
    csel x1, x20, x1,eq
    beq end_loop_copy
    subs x7,x7,#4
    beq loop_copy_8
    subs x7,x7,#8
    beq loop_copy_16

loop_copy_32:
    sub x1, x1,#24
    ld1 {v0.16b, v1.16b},[x1]

    sub x1, x1,#24
    ld1 {v0.16b, v1.16b},[x1],#32

    rev64 v6.4h, v6.4h
    rev64 v5.4h, v5.4h
    rev64 v4.4h, v4.4h
    rev64 v3.4h, v3.4h
    rev64 v2.4h, v2.4h
    rev64 v1.4h, v1.4h
    rev64 v0.4h, v0.4h

    st1 {v6.8b},[x6],#8
    st1 {v5.8b},[x6],#8
    st1 {v4.8b},[x6],#8
    st1 {v3.8b},[x6],#8
    st1 {v2.8b},[x6],#8
    st1 {v1.8b},[x6],#8
    st1 {v0.8b},[x6],#8

    ld1 {v4.8b, v5.8b, v6.8b},[x1],#24
    b end_loop_copy

loop_copy_16:
    sub x1, x1,#16
    ld1 {v0.8b, v1.8b, v2.8b},[x1]

    rev64 v2.4h, v2.4h
    rev64 v1.4h, v1.4h
    rev64 v0.4h, v0.4h

    st1 {v2.8b},[x6],#8
    st1 {v1.8b},[x6],#8
    st1 {v0.8b},[x6],#8

    b end_loop_copy
loop_copy_8:
    ld1 {v0.8b},[x1]
    rev64 v0.4h, v0.4h
    st1 {v0.8b},[x6],#8
end_loop_copy:
    sub x1, x1,#2

    ldrh w11, [x1], #-2
    sxtw x11,w11
    strh w11, [x6], #2
    sxtw x11,w11

    cmp x9, #-1
    bge prologue_8_16_32

    add x6, sp, x4, lsl #1 //ref_temp + 2 * nt
    sub x6, x6, #4 //ref_temp + 2 * nt - 2 - 2

    mov x12, #-1

    sub x20, x9, x12 //count to take care of ref_idx
    neg x9, x20

    add x1, x0, x4, lsl #2 //x1 = &src[4nt]

    mov x7, #128 //inv_ang_sum

loop_copy_ref_idx:

    add x7, x7, x8 //inv_ang_sum += inv_ang

    lsr x0, x7, #8
    lsl x0, x0, #1

    ldrh w11, [x1, x0]
    sxtw x11,w11
    strh w11, [x6], #-2
    sxtw x11,w11

    subs x9, x9, #1

    bne loop_copy_ref_idx
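
    //the loop above extends ref_temp with top reference samples projected by
    //the inverse angle; a rough C sketch (not from the original source,
    //indexing relative to an assumed ref_temp origin):
    //
    //    inv_ang_sum = 128;
    //    for(i = -1; i > (nt * intra_pred_ang) >> 5; i--)
    //    {
    //        inv_ang_sum += inv_ang;
    //        ref_temp[2 * (nt + i) - 2] = pu1_ref[4 * nt + 2 * (inv_ang_sum >> 8)];
    //        ref_temp[2 * (nt + i) - 1] = pu1_ref[4 * nt + 2 * (inv_ang_sum >> 8) + 1];
    //    }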

prologue_8_16_32:

    adrp x14, :got:col_for_intra_chroma
    ldr x14, [x14, #:got_lo12:col_for_intra_chroma]

    lsr x10, x4, #3
    ld1 {v31.8b},[x14],#8
    mul x10, x4, x10 //block counter (dec by #4 per 8x8 block)

    lsl x11, x4, #1 //col counter to be inc/dec by #8
    smull v22.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)

    sub x7, x5, #11

    adrp x12, :got:idx_neg_idx_chroma_11_17 //load least idx table
    ldr x12, [x12, #:got_lo12:idx_neg_idx_chroma_11_17]

    add x12, x12, x7, lsl #4
    mov x8, x12

    mov x7, #8
    sub x7, x7, x3, lsl #3 //x7 = 8-8x3

    ldr w9, [x8]
    sxtw x9,w9
    lsl x9, x9, #1
    add x1, sp, x4, lsl #1 //ref_temp + 2nt

    xtn v6.8b, v22.8h
    dup v26.8b,w9 //least idx added to final idx values
    sub x1, x1, #2 //ref_temp + 2nt - 2

    add x6, x1, x9

    ld1 {v0.16b, v1.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)
    sshr v22.8h, v22.8h,#5

// mov x0, #31
    movi v29.8b, #31 //contains #31 for vand operation

// mov x0, #32
    movi v28.8b, #32

    sqxtn v8.8b, v22.8h
    shl v8.8b, v8.8b,#1 // 2 * idx

    and v6.8b, v6.8b , v29.8b //fract values in v6 (idx values in v8)
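
    //here v6 = ((col + 1) * intra_pred_ang) & 31 (fract, shared by U and V) and
    //v8 = 2 * (((col + 1) * intra_pred_ang) >> 5), the byte offset of each
    //column's ref_main_idx in the interleaved reference; the adds/sub below
    //fold in the per-byte U/V offset and rebase against the least idx so that
    //the tbl lookups stay inside the 32-byte window held in {v0,v1}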

// mov x0, #2
    movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1

    mov x0,#0x100 // idx value for v is +1 of u
    dup v27.4h,w0
    add v27.8b, v27.8b , v29.8b
    mov x0,#0

    add v8.8b, v8.8b , v27.8b //ref_main_idx (add row)
    sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0)
    add v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0)
    tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
    sub v7.8b, v28.8b , v6.8b //32-fract

    tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
    add v4.8b, v8.8b , v29.8b //ref_main_idx (row 1)
    add v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1)

// mov x0, #4 @ 2 *(row * 2 )
    movi v29.8b, #4

    tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
    umull v24.8h, v12.8b, v7.8b //mul (row 0)
    umlal v24.8h, v13.8b, v6.8b //mul (row 0)

    tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
    add v8.8b, v8.8b , v29.8b //ref_main_idx (row 2)
    add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2)

    rshrn v24.8b, v24.8h,#5 //round shft (row 0)
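
    //the tbl/umull/umlal/rshrn #5 pattern implements the HEVC angular
    //interpolation per interleaved chroma byte:
    //    pred = ((32 - fract) * ref[i] + fract * ref[i + 2] + 16) >> 5
    //where ref[i + 2] is the next sample of the same plane (2 bytes away) and
    //the rounding +16 comes from rshrn #5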

    tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
    umull v22.8h, v16.8b, v7.8b //mul (row 1)
    umlal v22.8h, v17.8b, v6.8b //mul (row 1)

    tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
    add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
    add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)

    st1 {v24.8b},[x2], x3 //st (row 0)
    rshrn v22.8b, v22.8h,#5 //round shft (row 1)

    tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
    umull v20.8h, v14.8b, v7.8b //mul (row 2)
    umlal v20.8h, v15.8b, v6.8b //mul (row 2)

    tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
    add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
    add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)

    st1 {v22.8b},[x2], x3 //st (row 1)
    rshrn v20.8b, v20.8h,#5 //round shft (row 2)

    tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
    umull v18.8h, v10.8b, v7.8b //mul (row 3)
    umlal v18.8h, v11.8b, v6.8b //mul (row 3)

    tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
    add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
    add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)

    st1 {v20.8b},[x2], x3 //st (row 2)
    rshrn v18.8b, v18.8h,#5 //round shft (row 3)

    tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
    umull v24.8h, v12.8b, v7.8b //mul (row 4)
    umlal v24.8h, v13.8b, v6.8b //mul (row 4)

    tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
    add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
    add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)

    st1 {v18.8b},[x2], x3 //st (row 3)
    cmp x4,#4
    beq end_func
    rshrn v24.8b, v24.8h,#5 //round shft (row 4)

    tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
    umull v22.8h, v16.8b, v7.8b //mul (row 5)
    umlal v22.8h, v17.8b, v6.8b //mul (row 5)

    tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
    add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
    add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)

    st1 {v24.8b},[x2], x3 //st (row 4)
    rshrn v22.8b, v22.8h,#5 //round shft (row 5)

    tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    umull v20.8h, v14.8b, v7.8b //mul (row 6)
    umlal v20.8h, v15.8b, v6.8b //mul (row 6)

    tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    umull v18.8h, v10.8b, v7.8b //mul (row 7)
    umlal v18.8h, v11.8b, v6.8b //mul (row 7)

    st1 {v22.8b},[x2], x3 //st (row 5)
    rshrn v20.8b, v20.8h,#5 //round shft (row 6)
    rshrn v18.8b, v18.8h,#5 //round shft (row 7)

    st1 {v20.8b},[x2], x3 //st (row 6)

    subs x10, x10, #4 //subtract 4 and go to end if the block was 8x8

    st1 {v18.8b},[x2], x3 //st (row 7)

    beq end_func

    subs x11, x11, #8
    add x20, x8, #4
    csel x8, x20, x8,gt
    add x20, x2, x7
    csel x2, x20, x2,gt
    csel x8, x12, x8,le
    sub x20, x2, x4
    csel x2, x20, x2,le
    add x20, x2, #8
    csel x2, x20, x2,le
    lsl x20, x4, #1
    csel x11,x20,x11,le
    bgt lbl400
    adrp x14, :got:col_for_intra_chroma
    ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl400:
    add x20, x0, #8
    csel x0, x20, x0,le

    ld1 {v31.8b},[x14],#8
    smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
    xtn v10.8b, v12.8h
    sshr v12.8h, v12.8h,#5
    sqxtn v11.8b, v12.8h
    shl v11.8b, v11.8b,#1
    orr x5,x0,x0, lsl#8
    add x5, x5,#0x002
    add x5, x5,#0x300
    dup v27.4h,w5 //row value inc or reset accordingly
    ldr w9, [x8]
    sxtw x9,w9
    lsl x9, x9, #1
    add x9, x9, x0, lsl #1
// sub x9, x9, #1
    dup v26.8b,w9
    add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
    mov x5,x2

// sub x4,x4,#8

kernel_8_16_32:
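    //the kernel is software pipelined: the tbl loads and index updates for the
    //next 8x8 block are interleaved with the final multiplies, rounding shifts
    //and stores of rows 4..7 of the previous iteration (marked "(from previous
    //loop)" below)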
    movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1

    sub v8.8b, v8.8b , v26.8b //ref_main_idx
    mov v26.8b, v10.8b

    subs x11, x11, #8
    add x6, x1, x9
    tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)
    add v9.8b, v29.8b , v8.8b //ref_main_idx + 1

    umull v20.8h, v14.8b, v7.8b //mul (row 6)
    tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    umlal v20.8h, v15.8b, v6.8b //mul (row 6)

    add x20, x0, #8
    csel x0, x20, x0,le
    add x20, x8, #4
    csel x8, x20, x8,gt
    ld1 {v0.16b, v1.16b}, [x6] //stores the 32 values reqd based on indices values (from least idx)

    st1 {v24.8b},[x5], x3 //st (row 4)
    rshrn v24.8b, v22.8h,#5 //round shft (row 5)

    csel x8, x12, x8,le
    orr x9,x0,x0, lsl#8
    lsl x9, x9, #1
    add x9, x9,#0x002
    add x9, x9,#0x300
    dup v27.4h,w9 //row value inc or reset accordingly

    bgt lbl452
    adrp x14, :got:col_for_intra_chroma
    ldr x14, [x14, #:got_lo12:col_for_intra_chroma]
lbl452:

    add v4.8b, v29.8b , v8.8b //ref_main_idx (row 1)
    tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0)
    add v5.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 1)

    movi v29.8b, #31 //contains #31 for vand operation (extract fract)

    umull v18.8h, v10.8b, v7.8b //mul (row 7)
    tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0)
    umlal v18.8h, v11.8b, v6.8b //mul (row 7)

    ld1 {v31.8b},[x14],#8
    and v6.8b, v29.8b , v26.8b //fract values in v6
    movi v29.8b, #4 //contains #4 for stepping ref_main_idx by two rows (2 samples * 2 bytes)

    st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
    rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)

    add v8.8b, v29.8b , v8.8b //ref_main_idx (row 2)
    tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1)
    add v9.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 2)

    lsl x20, x4, #1
    csel x11,x20,x11,le
    ldr w9, [x8]
    sxtw x9,w9
    lsl x9, x9, #1
    sub v7.8b, v28.8b , v6.8b //32-fract

    umull v24.8h, v12.8b, v7.8b //mul (row 0)
    tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1)
    umlal v24.8h, v13.8b, v6.8b //mul (row 0)

    st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
    rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)

    add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3)
    tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2)
    add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3)

    umull v22.8h, v16.8b, v7.8b //mul (row 1)
    tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2)
    umlal v22.8h, v17.8b, v6.8b //mul (row 1)

    rshrn v24.8b, v24.8h,#5 //round shft (row 0)
    st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7)

    add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4)
    tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3)
    add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4)

    umull v20.8h, v14.8b, v7.8b //mul (row 2)
    tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3)
    umlal v20.8h, v15.8b, v6.8b //mul (row 2)

    smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col)
    add x5,x2,x3,lsl#2
    add x9, x9, x0, lsl #1


    st1 {v24.8b},[x2], x3 //st (row 0)
    rshrn v22.8b, v22.8h,#5 //round shft (row 1)

    add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5)
    tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4)
    add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5)

    umull v18.8h, v10.8b, v7.8b //mul (row 3)
    tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4)
    umlal v18.8h, v11.8b, v6.8b //mul (row 3)

    st1 {v22.8b},[x2], x3 //st (row 1)
    rshrn v20.8b, v20.8h,#5 //round shft (row 2)

    xtn v10.8b, v14.8h
    sshr v14.8h, v14.8h,#5

    add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6)
    tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5)
    add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6)

    umull v24.8h, v12.8b, v7.8b //mul (row 4)
    tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5)
    umlal v24.8h, v13.8b, v6.8b //mul (row 4)

    st1 {v20.8b},[x2], x3 //st (row 2)
    rshrn v18.8b, v18.8h,#5 //round shft (row 3)

// sub x9, x9, #1
    sqxtn v11.8b, v14.8h

    add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7)
    tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6)
    add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7)

    shl v11.8b, v11.8b,#1

    umull v22.8h, v16.8b, v7.8b //mul (row 5)
    tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6)
    umlal v22.8h, v17.8b, v6.8b //mul (row 5)

    add v8.8b, v27.8b , v11.8b //ref_main_idx (add row)
    dup v26.8b,w9

    st1 {v18.8b},[x2], x3 //st (row 3)
    rshrn v24.8b, v24.8h,#5 //round shft (row 4)


    add x2, x2, x3, lsl #2
    add x20, x7, x2
    csel x2, x20, x2,gt
    sub x20, x2, x4, lsl #1
    csel x2, x20, x2,le
    add x20,x2,#8
    csel x2, x20, x2,le

    subs x10, x10, #4 //subtract 4 and loop back while 8x8 blocks remain

    bne kernel_8_16_32
epil_8_16_32:

    tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7)

    umull v20.8h, v14.8b, v7.8b //mul (row 6)
    tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7)
    umlal v20.8h, v15.8b, v6.8b //mul (row 6)

    st1 {v24.8b},[x5], x3 //st (row 4)
    rshrn v24.8b, v22.8h,#5 //round shft (row 5)

    umull v18.8h, v10.8b, v7.8b //mul (row 7)
    umlal v18.8h, v11.8b, v6.8b //mul (row 7)

    st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5)
    rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6)

    st1 {v20.8b},[x5], x3 //(from previous loop)st (row 6)
    rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7)

    st1 {v18.8b},[x5], x3 //st (row 7)

end_func:
    add sp, sp, #132
    // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
    ldp x19, x20,[sp],#16
    pop_v_regs
    ret