/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * TODO: slightly better instruction scheduling is needed.
 */
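
/*
 * Judging by the register aliases used below (r0-r3), the C-level entry
 * point presumably looks something like
 *
 *    void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
 *                               JSAMPARRAY output_buf, JDIMENSION output_col);
 *
 * (an inference from the argument usage in this file, not copied from a
 * header).
 */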

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

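/*
 * These are the 8-bit FIX() constants from jidctfst.c, pre-scaled by 128 so
 * that they can be used with VQDMULH.S16, which computes roughly
 * (a * b) >> 15.  For a multiplier 1 < c < 2 the table holds
 * (FIX(c) - 256) * 128, e.g. (277 - 256) * 128 = 2688 ~= (1.082392200 - 1) * 32768,
 * and the helper below adds the input back in: x * c ~= x + vqdmulh(x, 2688).
 * For XFIX_2_613125930 the stored value is (FIX(c) - 512) * 128 and 2 * x is
 * added back instead (hence the vadd of \x3 with itself in idct_helper).
 */
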
/* 1-D IDCT helper macro */

.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
                   t10, t11, t12, t13, t14

    vsub.s16 \t10, \x0, \x4
    vadd.s16 \x4, \x0, \x4
    vswp.s16 \t10, \x0
    vsub.s16 \t11, \x2, \x6
    vadd.s16 \x6, \x2, \x6
    vswp.s16 \t11, \x2
    vsub.s16 \t10, \x3, \x5
    vadd.s16 \x5, \x3, \x5
    vswp.s16 \t10, \x3
    vsub.s16 \t11, \x1, \x7
    vadd.s16 \x7, \x1, \x7
    vswp.s16 \t11, \x1

    vqdmulh.s16 \t13, \x2, d0[1]
    vadd.s16 \t12, \x3, \x3
    vadd.s16 \x2, \x2, \t13
    vqdmulh.s16 \t13, \x3, d0[3]
    vsub.s16 \t10, \x1, \x3
    vadd.s16 \t12, \t12, \t13
    vqdmulh.s16 \t13, \t10, d0[2]
    vsub.s16 \t11, \x7, \x5
    vadd.s16 \t10, \t10, \t13
    vqdmulh.s16 \t13, \t11, d0[1]
    vadd.s16 \t11, \t11, \t13

    vqdmulh.s16 \t13, \x1, d0[0]
    vsub.s16 \x2, \x6, \x2
    vsub.s16 \t14, \x0, \x2
    vadd.s16 \x2, \x0, \x2
    vadd.s16 \x0, \x4, \x6
    vsub.s16 \x4, \x4, \x6
    vadd.s16 \x1, \x1, \t13
    vadd.s16 \t13, \x7, \x5
    vsub.s16 \t12, \t13, \t12
    vsub.s16 \t12, \t12, \t10
    vadd.s16 \t11, \t12, \t11
    vsub.s16 \t10, \x1, \t10
    vadd.s16 \t10, \t10, \t11

    vsub.s16 \x7, \x0, \t13
    vadd.s16 \x0, \x0, \t13
    vadd.s16 \x6, \t14, \t12
    vsub.s16 \x1, \t14, \t12
    vsub.s16 \x5, \x2, \t11
    vadd.s16 \x2, \x2, \t11
    vsub.s16 \x3, \x4, \t10
    vadd.s16 \x4, \x4, \t10
.endm

asm_function jsimd_idct_ifast_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP, jsimd_idct_ifast_neon_consts
    vld1.16 {d0}, [TMP, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | d12    | d13
     *   5  | d14    | d15
     *   6  | d16    | d17
     *   7  | d18    | d19
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK]!
    /* Dequantize */
    vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16 q2, q2, q10
    vld1.16 {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16 q3, q3, q11
    vmul.s16 q4, q4, q12
    vld1.16 {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16 q5, q5, q13
    vmul.s16 q6, q6, q14
    vld1.16 {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16 q7, q7, q15
    vmul.s16 q8, q8, q10
    vmul.s16 q9, q9, q11

    /* Pass 1 */
    idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose */
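    /* The 8x8 block sits in d4-d19 as rows.  A full 8x8 transpose is built
     * from four 4x4 transposes (one per quadrant) followed by the vswp
     * instructions below, which exchange the two off-diagonal quadrants. */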
    transpose_4x4 d4, d6, d8, d10
    transpose_4x4 d5, d7, d9, d11
    transpose_4x4 d12, d14, d16, d18
    transpose_4x4 d13, d15, d17, d19
    vswp d12, d5
    vswp d14, d7
    vswp d16, d9
    vswp d18, d11

    /* Pass 2 */
    idct_helper q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose */
    transpose_4x4 d4, d6, d8, d10
    transpose_4x4 d5, d7, d9, d11
    transpose_4x4 d12, d14, d16, d18
    transpose_4x4 d13, d15, d17, d19
    vswp d12, d5
    vswp d14, d7
    vswp d16, d9
    vswp d18, d11

    /* Descale and range limit */
    vmov.s16 q15, #(0x80 << 5)
    vqadd.s16 q2, q2, q15
    vqadd.s16 q3, q3, q15
    vqadd.s16 q4, q4, q15
    vqadd.s16 q5, q5, q15
    vqadd.s16 q6, q6, q15
    vqadd.s16 q7, q7, q15
    vqadd.s16 q8, q8, q15
    vqadd.s16 q9, q9, q15
    vqshrun.s16 d4, q2, #5
    vqshrun.s16 d6, q3, #5
    vqshrun.s16 d8, q4, #5
    vqshrun.s16 d10, q5, #5
    vqshrun.s16 d12, q6, #5
    vqshrun.s16 d14, q7, #5
    vqshrun.s16 d16, q8, #5
    vqshrun.s16 d18, q9, #5

    /* Store results to the output buffer */
    .irp x, d4, d6, d8, d10, d12, d14, d16, d18
        ldr TMP, [OUTPUT_BUF], #4
        add TMP, TMP, OUTPUT_COL
        vst1.8 {\x}, [TMP]!
    .endr

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP
.endfunc

.purgem idct_helper

/*****************************************************************************/

252/*
DRC8c60d222011-06-17 21:12:58 +0000253 * jsimd_idct_4x4_neon
254 *
255 * This function contains inverse-DCT code for getting reduced-size
256 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
257 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
258 * function from jpeg-6b (jidctred.c).
259 *
260 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
261 * requires much less arithmetic operations and hence should be faster.
262 * The primary purpose of this particular NEON optimized function is
263 * bit exact compatibility with jpeg-6b.
264 *
265 * TODO: a bit better instructions scheduling can be achieved by expanding
266 * idct_helper/transpose_4x4 macros and reordering instructions,
267 * but readability will suffer somewhat.
268 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

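/*
 * FIX(x) is the usual libjpeg 13-bit fixed-point encoding, round(x * 2^13);
 * e.g. 1.847759065 * 8192 = 15136.8, stored as 15137.  idct_helper below
 * accumulates the products in 32-bit registers (vmull/vmlal) and its \shift
 * parameter descales the results back to 16 bits.
 */
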
.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065       /* d0[0] */
    .short -FIX_0_765366865      /* d0[1] */
    .short -FIX_0_211164243      /* d0[2] */
    .short FIX_1_451774981       /* d0[3] */
    .short -FIX_2_172734803      /* d1[0] */
    .short FIX_1_061594337       /* d1[1] */
    .short -FIX_0_509795579      /* d1[2] */
    .short -FIX_0_601344887      /* d1[3] */
    .short FIX_0_899976223       /* d2[0] */
    .short FIX_2_562915447       /* d2[1] */
    .short 1 << (CONST_BITS+1)   /* d2[2] */
    .short 0                     /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16 q14, \x4, d2[2]
    vmlal.s16 q14, \x8, d0[0]
    vmlal.s16 q14, \x14, d0[1]

    vmull.s16 q13, \x16, d1[2]
    vmlal.s16 q13, \x12, d1[3]
    vmlal.s16 q13, \x10, d2[0]
    vmlal.s16 q13, \x6, d2[1]

    vmull.s16 q15, \x4, d2[2]
    vmlsl.s16 q15, \x8, d0[0]
    vmlsl.s16 q15, \x14, d0[1]

    vmull.s16 q12, \x16, d0[2]
    vmlal.s16 q12, \x12, d0[3]
    vmlal.s16 q12, \x10, d1[0]
    vmlal.s16 q12, \x6, d1[1]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y29, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y29, q14, #\shift
.endif

    vadd.s32 q10, q15, q12
    vsub.s32 q15, q15, q12

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q15, q15, #\shift
    vmovn.s32 \y27, q10
    vmovn.s32 \y28, q15
.else
    vrshrn.s32 \y27, q10, #\shift
    vrshrn.s32 \y28, q15, #\shift
.endif

.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    vpush {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q3, q3, q10
    vmul.s16 q4, q4, q11
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    vmul.s16 q6, q6, q13
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q7, q7, q14
    vmul.s16 q8, q8, q15

    /* Pass 1 */
    idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4 d5, d7, d9, d11

    /* Pass 2 */
    idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4 d26, d27, d28, d29

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vadd.s16 q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
    vst1.32 {d26[0]}, [TMP1]!
    vst1.32 {d27[0]}, [TMP3]!
    vst1.32 {d26[1]}, [TMP2]!
    vst1.32 {d27[1]}, [TMP4]!
#else
    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[0]}, [TMP3]!
    vst1.8 {d26[1]}, [TMP1]!
    vst1.8 {d27[1]}, [TMP3]!
    vst1.8 {d26[2]}, [TMP1]!
    vst1.8 {d27[2]}, [TMP3]!
    vst1.8 {d26[3]}, [TMP1]!
    vst1.8 {d27[3]}, [TMP3]!

    vst1.8 {d26[4]}, [TMP2]!
    vst1.8 {d27[4]}, [TMP4]!
    vst1.8 {d26[5]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP4]!
    vst1.8 {d26[6]}, [TMP2]!
    vst1.8 {d27[6]}, [TMP4]!
    vst1.8 {d26[7]}, [TMP2]!
    vst1.8 {d27[7]}, [TMP4]!
#endif

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit-exact compatibility with jpeg-6b.
 */
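
/*
 * Only a 2x2 block of pixels is produced: OUTPUT_BUF is expected to hold two
 * row pointers, and two bytes are stored at OUTPUT_COL in each row (see the
 * ldmia/vst1.8 sequence at the end of the function).
 */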

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16 q14, \x4, #15
    vmull.s16 q13, \x6, d0[3]
    vmlal.s16 q13, \x10, d0[2]
    vmlal.s16 q13, \x12, d0[1]
    vmlal.s16 q13, \x16, d0[0]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y27, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y27, q14, #\shift
.endif

.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    vld1.16 {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | -      | -
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | -      | -
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vmul.s16 q3, q3, q10
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16 q6, q6, q13
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4 d5, d7, d9, d11
#else
    vmull.s16 q13, d6, d0[3]
    vmlal.s16 q13, d10, d0[2]
    vmlal.s16 q13, d12, d0[1]
    vmlal.s16 q13, d16, d0[0]
    vmull.s16 q12, d7, d0[3]
    vmlal.s16 q12, d11, d0[2]
    vmlal.s16 q12, d13, d0[1]
    vmlal.s16 q12, d17, d0[0]
    vshll.s16 q14, d4, #15
    vshll.s16 q15, d5, #15
    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13
    vrshrn.s32 d4, q10, #13
    vrshrn.s32 d6, q14, #13
    vadd.s32 q10, q15, q12
    vsub.s32 q14, q15, q12
    vrshrn.s32 d5, q10, #13
    vrshrn.s32 d7, q14, #13
    vtrn.16 q2, q3
    vtrn.32 q3, q5
#endif

    /* Pass 2 */
    idct_helper d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q13

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[4]}, [TMP1]!
    vst1.8 {d26[1]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP2]!

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
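
/*
 * The fixed-point multipliers in the per-function constant tables below
 * correspond to the usual JPEG/JFIF conversion
 *
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * scaled to 16 bits: 22971 ~= 1.40200 * 2^14, 29033 ~= 1.77200 * 2^14,
 * 11277 ~= 0.34414 * 2^15 and 23401 ~= 0.71414 * 2^15 (which is why
 * do_yuv_to_rgb narrows some products with #14 and others with #15).
 */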


.macro do_load size
    .if \size == 8
        vld1.8 {d4}, [U]!
        vld1.8 {d5}, [V]!
        vld1.8 {d0}, [Y]!
        pld [Y, #64]
        pld [U, #64]
        pld [V, #64]
    .elseif \size == 4
        vld1.8 {d4[0]}, [U]!
        vld1.8 {d4[1]}, [U]!
        vld1.8 {d4[2]}, [U]!
        vld1.8 {d4[3]}, [U]!
        vld1.8 {d5[0]}, [V]!
        vld1.8 {d5[1]}, [V]!
        vld1.8 {d5[2]}, [V]!
        vld1.8 {d5[3]}, [V]!
        vld1.8 {d0[0]}, [Y]!
        vld1.8 {d0[1]}, [Y]!
        vld1.8 {d0[2]}, [Y]!
        vld1.8 {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8 {d4[4]}, [U]!
        vld1.8 {d4[5]}, [U]!
        vld1.8 {d5[4]}, [V]!
        vld1.8 {d5[5]}, [V]!
        vld1.8 {d0[4]}, [Y]!
        vld1.8 {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8 {d4[6]}, [U]!
        vld1.8 {d5[6]}, [V]!
        vld1.8 {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8 {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8 {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

.macro do_yuv_to_rgb
    vaddw.u8 q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5      /* q4 = v - 128 */
    vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q10, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vqmovun.s16 d1\r_offs, q12
    vqmovun.s16 d1\b_offs, q14
.endm

/* Apple gas crashes on adrl; work around that by using adr, which requires
 * a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH .req r0
    INPUT_BUF .req r1
    INPUT_ROW .req r2
    OUTPUT_BUF .req r3
    NUM_ROWS .req r4

    INPUT_BUF0 .req r5
    INPUT_BUF1 .req r6
    INPUT_BUF2 .req INPUT_BUF

    RGB .req r7
    Y .req r8
    U .req r9
    V .req r10
    N .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #4]
    ldr INPUT_BUF2, [INPUT_BUF, #8]
    .unreq INPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8 q5, #255
    vmov.u8 q6, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs N, N, #8
    blt 2f
1:
    do_load 8
    do_yuv_to_rgb
    do_store \bpp, 8
    subs N, N, #8
    bge 1b
    tst N, #7
    beq 8f
2:
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
.endfunc

.purgem do_yuv_to_rgb

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
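
/*
 * The constant tables below encode the usual JPEG/JFIF forward conversion
 *
 *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * with the coefficients scaled by 2^16 (e.g. 19595 ~= 0.29900 * 65536,
 * 38470 ~= 0.58700 * 65536, 7471 ~= 0.11400 * 65536); the 32767/128 pairs
 * supply the +128 bias plus rounding for the chroma channels.
 */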

.macro do_store size
    .if \size == 8
        vst1.8 {d20}, [Y]!
        vst1.8 {d21}, [U]!
        vst1.8 {d22}, [V]!
    .elseif \size == 4
        vst1.8 {d20[0]}, [Y]!
        vst1.8 {d20[1]}, [Y]!
        vst1.8 {d20[2]}, [Y]!
        vst1.8 {d20[3]}, [Y]!
        vst1.8 {d21[0]}, [U]!
        vst1.8 {d21[1]}, [U]!
        vst1.8 {d21[2]}, [U]!
        vst1.8 {d21[3]}, [U]!
        vst1.8 {d22[0]}, [V]!
        vst1.8 {d22[1]}, [V]!
        vst1.8 {d22[2]}, [V]!
        vst1.8 {d22[3]}, [V]!
    .elseif \size == 2
        vst1.8 {d20[4]}, [Y]!
        vst1.8 {d20[5]}, [Y]!
        vst1.8 {d21[4]}, [U]!
        vst1.8 {d21[5]}, [U]!
        vst1.8 {d22[4]}, [V]!
        vst1.8 {d22[5]}, [V]!
    .elseif \size == 1
        vst1.8 {d20[6]}, [Y]!
        vst1.8 {d21[6]}, [U]!
        vst1.8 {d22[6]}, [V]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_load bpp, size
    .if \bpp == 24
        .if \size == 8
            vld3.8 {d10, d11, d12}, [RGB]!
            pld [RGB, #128]
        .elseif \size == 4
            vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
            vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
            vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
            vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
            vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vld4.8 {d10, d11, d12, d13}, [RGB]!
            pld [RGB, #128]
        .elseif \size == 4
            vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * Two-stage pipelined RGB -> YCbCr conversion
 */
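
/*
 * do_rgb_to_yuv_stage2_store_load_stage1 below interleaves the second half
 * of one iteration with the stores and with the load/first half of the next
 * iteration, which helps hide the latency of the long multiply-accumulate
 * chains.
 */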

.macro do_rgb_to_yuv_stage1
    vmovl.u8 q2, d1\r_offs   /* r = { d4, d5 } */
    vmovl.u8 q3, d1\g_offs   /* g = { d6, d7 } */
    vmovl.u8 q4, d1\b_offs   /* b = { d8, d9 } */
    vmull.u16 q7, d4, d0[0]
    vmlal.u16 q7, d6, d0[1]
    vmlal.u16 q7, d8, d0[2]
    vmull.u16 q8, d5, d0[0]
    vmlal.u16 q8, d7, d0[1]
    vmlal.u16 q8, d9, d0[2]
    vrev64.32 q9, q1
    vrev64.32 q13, q1
    vmlsl.u16 q9, d4, d0[3]
    vmlsl.u16 q9, d6, d1[0]
    vmlal.u16 q9, d8, d1[1]
    vmlsl.u16 q13, d5, d0[3]
    vmlsl.u16 q13, d7, d1[0]
    vmlal.u16 q13, d9, d1[1]
    vrev64.32 q14, q1
    vrev64.32 q15, q1
    vmlal.u16 q14, d4, d1[1]
    vmlsl.u16 q14, d6, d1[2]
    vmlsl.u16 q14, d8, d1[3]
    vmlal.u16 q15, d5, d1[1]
    vmlsl.u16 q15, d7, d1[2]
    vmlsl.u16 q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32 d20, q7, #16
    vrshrn.u32 d21, q8, #16
    vshrn.u32 d22, q9, #16
    vshrn.u32 d23, q13, #16
    vshrn.u32 d24, q14, #16
    vshrn.u32 d25, q15, #16
    vmovn.u16 d20, q10      /* d20 = y */
    vmovn.u16 d21, q11      /* d21 = u */
    vmovn.u16 d22, q12      /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32 d20, q7, #16
    vrshrn.u32 d21, q8, #16
    vshrn.u32 d22, q9, #16
    vrev64.32 q9, q1
    vshrn.u32 d23, q13, #16
    vrev64.32 q13, q1
    vshrn.u32 d24, q14, #16
    vshrn.u32 d25, q15, #16
    do_load \bpp, 8
    vmovn.u16 d20, q10      /* d20 = y */
    vmovl.u8 q2, d1\r_offs  /* r = { d4, d5 } */
    vmovn.u16 d21, q11      /* d21 = u */
    vmovl.u8 q3, d1\g_offs  /* g = { d6, d7 } */
    vmovn.u16 d22, q12      /* d22 = v */
    vmovl.u8 q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16 q7, d4, d0[0]
    vmlal.u16 q7, d6, d0[1]
    vmlal.u16 q7, d8, d0[2]
    vst1.8 {d20}, [Y]!
    vmull.u16 q8, d5, d0[0]
    vmlal.u16 q8, d7, d0[1]
    vmlal.u16 q8, d9, d0[2]
    vmlsl.u16 q9, d4, d0[3]
    vmlsl.u16 q9, d6, d1[0]
    vmlal.u16 q9, d8, d1[1]
    vst1.8 {d21}, [U]!
    vmlsl.u16 q13, d5, d0[3]
    vmlsl.u16 q13, d7, d1[0]
    vmlal.u16 q13, d9, d1[1]
    vrev64.32 q14, q1
    vrev64.32 q15, q1
    vmlal.u16 q14, d4, d1[1]
    vmlsl.u16 q14, d6, d1[2]
    vmlsl.u16 q14, d8, d1[3]
    vst1.8 {d22}, [V]!
    vmlal.u16 q15, d5, d1[1]
    vmlsl.u16 q15, d7, d1[2]
    vmlsl.u16 q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short 19595, 38470, 7471, 11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128, 32767, 128
    .short 32767, 128, 32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH .req r0
    INPUT_BUF .req r1
    OUTPUT_BUF .req r2
    OUTPUT_ROW .req r3
    NUM_ROWS .req r4

    OUTPUT_BUF0 .req r5
    OUTPUT_BUF1 .req r6
    OUTPUT_BUF2 .req OUTPUT_BUF

    RGB .req r7
    Y .req r8
    U .req r9
    V .req r10
    N .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]
    ldr OUTPUT_BUF0, [OUTPUT_BUF]
    ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq OUTPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add OUTPUT_ROW, OUTPUT_ROW, #1
    ldr RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load \bpp, 8
    do_rgb_to_yuv_stage1
    subs N, N, #8
    blt 2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_rgb_to_yuv_stage2
    do_store 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load \bpp, 4
3:
    tst N, #2
    beq 4f
    do_load \bpp, 2
4:
    tst N, #1
    beq 5f
    do_load \bpp, 1
5:
    do_rgb_to_yuv
    tst N, #4
    beq 6f
    do_store 4
6:
    tst N, #2
    beq 7f
    do_store 2
7:
    tst N, #1
    beq 8f
    do_store 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq OUTPUT_ROW
    .unreq INPUT_BUF
    .unreq NUM_ROWS
    .unreq OUTPUT_BUF0
    .unreq OUTPUT_BUF1
    .unreq OUTPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
.endfunc

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */
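
/*
 * The conversion itself is a widening subtract of the sample midpoint:
 * vsubl.u8 with d0 = 128 maps JSAMPLE values in [0, 255] to 16-bit values
 * that, interpreted as signed DCTELEMs, lie in [-128, 127].
 */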

asm_function jsimd_convsamp_neon
    SAMPLE_DATA .req r0
    START_COL .req r1
    WORKSPACE .req r2
    TMP1 .req r3
    TMP2 .req r4
    TMP3 .req r5
    TMP4 .req ip

    push {r4, r5}
    vmov.u8 d0, #128

    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d16}, [TMP1]
    vsubl.u8 q8, d16, d0
    vld1.8 {d18}, [TMP2]
    vsubl.u8 q9, d18, d0
    vld1.8 {d20}, [TMP3]
    vsubl.u8 q10, d20, d0
    vld1.8 {d22}, [TMP4]
    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8 q11, d22, d0
    vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d24}, [TMP1]
    vsubl.u8 q12, d24, d0
    vld1.8 {d26}, [TMP2]
    vsubl.u8 q13, d26, d0
    vld1.8 {d28}, [TMP3]
    vsubl.u8 q14, d28, d0
    vld1.8 {d30}, [TMP4]
    vsubl.u8 q15, d30, d0
    vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop {r4, r5}
    bx lr

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)              /* XFIX_0_382683433 */
    .short (139 * 128)             /* XFIX_0_541196100 */
    .short (181 * 128)             /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */

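/*
 * As in the fast IDCT above, these are the 8-bit FIX() constants from
 * jfdctfst.c scaled by 128 for use with VQDMULH.S16 (roughly (a * b) >> 15),
 * e.g. 181 * 128 = 23168 ~= 0.707106781 * 32768.  The one multiplier larger
 * than 1 is stored as (FIX(c) - 256) * 128, and the extra copy of the input
 * is added back separately in the 1-D FDCT below.
 */
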
asm_function jsimd_fdct_ifast_neon

    DATA .req r0
    TMP .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP, jsimd_fdct_ifast_neon_consts
    vld1.16 {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | q8
     *   1  | d18    | d19    | q9
     *   2  | d20    | d21    | q10
     *   3  | d22    | d23    | q11
     *   4  | d24    | d25    | q12
     *   5  | d26    | d27    | q13
     *   6  | d28    | d29    | q14
     *   7  | d30    | d31    | q15
     */

    vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16 {d28, d29, d30, d31}, [DATA, :128]
    sub DATA, DATA, #(128 - 32)

    mov TMP, #2
1:
    /* Transpose */
    vtrn.16 q12, q13
    vtrn.16 q10, q11
    vtrn.16 q8, q9
    vtrn.16 q14, q15
    vtrn.32 q9, q11
    vtrn.32 q13, q15
    vtrn.32 q8, q10
    vtrn.32 q12, q14
    vswp d30, d23
    vswp d24, d17
    vswp d26, d19
    /* 1-D FDCT */
    vadd.s16 q2, q11, q12
    vswp d28, d21
    vsub.s16 q12, q11, q12
    vsub.s16 q6, q10, q13
    vadd.s16 q10, q10, q13
    vsub.s16 q7, q9, q14
    vadd.s16 q9, q9, q14
    vsub.s16 q1, q8, q15
    vadd.s16 q8, q8, q15
    vsub.s16 q4, q9, q10
    vsub.s16 q5, q8, q2
    vadd.s16 q3, q9, q10
    vadd.s16 q4, q4, q5
    vadd.s16 q2, q8, q2
    vqdmulh.s16 q4, q4, XFIX_0_707106781
    vadd.s16 q11, q12, q6
    vadd.s16 q8, q2, q3
    vsub.s16 q12, q2, q3
    vadd.s16 q3, q6, q7
    vadd.s16 q7, q7, q1
    vqdmulh.s16 q3, q3, XFIX_0_707106781
    vsub.s16 q6, q11, q7
    vadd.s16 q10, q5, q4
    vqdmulh.s16 q6, q6, XFIX_0_382683433
    vsub.s16 q14, q5, q4
    vqdmulh.s16 q11, q11, XFIX_0_541196100
    vqdmulh.s16 q5, q7, XFIX_1_306562965
    vadd.s16 q4, q1, q3
    vsub.s16 q3, q1, q3
    vadd.s16 q7, q7, q6
    vadd.s16 q11, q11, q6
    vadd.s16 q7, q7, q5
    vadd.s16 q13, q3, q11
    vsub.s16 q11, q3, q11
    vadd.s16 q9, q4, q7
    vsub.s16 q15, q4, q7
    subs TMP, TMP, #1
    bne 1b

    /* store results */
    vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16 {d28, d29, d30, d31}, [DATA, :128]

    vpop {d8-d15}
    bx lr

    .unreq DATA
    .unreq TMP
.endfunc

/*****************************************************************************/