/*
 * ARM NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
24
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon               /* NEON instructions are used throughout this file */
.arch armv7a
.object_arch armv4      /* tag the object with a low arch so old tools accept it */
.arm                    /* ARM (not Thumb) encoding for all code below */


#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/
39
/* Supplementary macro for setting function attributes.
 * Emits the platform-appropriate global label for \fname:
 * Mach-O (__APPLE__) symbols carry a leading underscore; ELF symbols
 * additionally get hidden visibility and a %function type annotation. */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
56
/* Transpose a block of 4x4 coefficients in four 64-bit registers.
 * Two levels of VTRN: 16-bit element swaps between row pairs, then
 * 32-bit swaps across the pairs, yielding the full 4x4 transpose. */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm

#define CENTERJSAMPLE 128 /* offset to convert signed samples to unsigned */

/*****************************************************************************/
68
/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

/* Fixed-point multipliers scaled by 2^13 (e.g. 2446 == 0.298631336 * 8192) */
#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

/* Pre-combined differences/sums so each product needs a single multiply */
#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 * (kept here as documentation; the NEON code below mirrors this dataflow).
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0 = q4; \
    tmp1 = q5; \
    tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3 = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

/* Lane aliases into d0-d2, matching the constant table layout below */
#define XFIX_0_899976223 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_2_562915447 d0[2]
#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
#define XFIX_1_175875602 d1[3]
#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
#define XFIX_1_175875602_MINUS_1_961570560 d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants (d2 gets clobbered and is re-fetched via ip) */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
198
asm_function jsimd_idct_islow_neon

    /*
     * r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col.
     * TMP1-TMP4 alias r0/r1/r2/ip once the argument registers are no
     * longer needed.  d8-d15 (callee-saved under AAPCS) are preserved
     * with vpush/vpop; r4/r5 are preserved around the zero-coefficient
     * scan of the right 4x8 half.
     */
    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8-d15} /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half:
     * r0 accumulates the OR of all right-half halfwords of rows 1-7
     * (loaded pairwise via ldrd); the 'orrs' below sets Z for 'beq 3f'. */
    push            {r4, r5}
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr             r0, r4, r5
    vmov            q4, q6
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    orr             r0, r0, r4
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    orr             r0, r0, r5
    vadd.s32        q1, q3, q2
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    orr             r0, r0, r4
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    orr             r0, r0, r5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr             r0, r0, r4
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    orr             r0, r0, r5
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    orr             r0, r0, r4
    vrshrn.s32      ROW6L, q1, #11
    orr             r0, r0, r5
    vadd.s32        q1, q3, q5
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW4L
    orr             r0, r0, r4
    vrshrn.s32      ROW2L, q1, #11
    orr             r0, r0, r5
    vrshrn.s32      ROW5L, q3, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr             r0, r0, r4
    vadd.s32        q2, q5, q6
    orrs            r0, r0, r5          /* Z set iff rows 1-7 right half all zero */
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    orr             r0, r4, r5          /* r0 = right half of row 0 (for label 3) */
    vsub.s32        q3, q1, q4
    pop             {r4, r5}
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11

    beq             3f /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
    /* Transpose left 4x8 half (interleaved with the right-half math) */
    vtrn.16         ROW6L, ROW7L
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
    vtrn.16         ROW2L, ROW3L
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16         ROW4L, ROW5L
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32         ROW1L, ROW3L
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    vtrn.32         ROW4L, ROW6L
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
    vtrn.32         ROW5L, ROW7L
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vrshrn.s32      ROW6R, q1, #11
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    vmov            q4, q6
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8-d15} /* restore NEON registers */
    vqrshrn.s16     d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16         q8, q9
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
    vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vtrn.16         q10, q11
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vadd.u8         q10, q10, q0
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vtrn.8          d22, d23
    vst1.8          {d20}, [TMP1]
    vadd.u8         q11, q11, q0
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0             /* r0 holds OR of row 0's right half */
    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64]    /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R
.endfunc
672
/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in ARM NEON case some extra additions are required because VQDMULH
 * instruction can't handle the constants larger than 1. So the expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
 */

/* Lane aliases into d0, matching the constant table layout below */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
702
asm_function jsimd_idct_ifast_neon

    /*
     * r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col.
     * TMP1-TMP4 alias r0/r1/r2/ip once the argument registers are no
     * longer needed.  d8-d13 (callee-saved under AAPCS) are preserved
     * with vpush/vpop.
     */
    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64] /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13} /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
    /* Transpose */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d28, d21
    vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
    vswp            d30, d23
    vadd.s16        q14, q10, q14
    vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8-d13} /* restore NEON registers */
    vadd.s16        q10, q10, q2
    /* Transpose */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    /* Descale and range limit */
    vmov.s16        q0, #(0x80 << 5)   /* CENTERJSAMPLE pre-scaled for the #5 shift */
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d24, d17
    vswp            d26, d19
    vqadd.s16       q8, q8, q0
    vswp            d28, d21
    vqadd.s16       q9, q9, q0
    vswp            d30, d23
    vqadd.s16       q10, q10, q0
    vqadd.s16       q11, q11, q0
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vqshrun.s16     d16, q8, #5
    vqshrun.s16     d17, q9, #5
    vqshrun.s16     d18, q10, #5
    vqshrun.s16     d19, q11, #5
    vst1.8          {d16}, [TMP1]
    vqadd.s16       q12, q12, q0
    vqadd.s16       q13, q13, q0
    vst1.8          {d17}, [TMP2]
    vqadd.s16       q14, q14, q0
    vqadd.s16       q15, q15, q0
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vqshrun.s16     d20, q12, #5
    vqshrun.s16     d21, q13, #5
    vst1.8          {d19}, [TMP2]
    vqshrun.s16     d22, q14, #5
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vst1.8          {d20}, [TMP1]
    vqshrun.s16     d23, q15, #5
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc
904
DRC321e0682011-05-03 08:47:43 +0000905/*****************************************************************************/
906
907/*
DRC8c60d222011-06-17 21:12:58 +0000908 * jsimd_idct_4x4_neon
909 *
910 * This function contains inverse-DCT code for getting reduced-size
911 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
912 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
913 * function from jpeg-6b (jidctred.c).
914 *
915 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
917 * The primary purpose of this particular NEON optimized function is
918 * bit exact compatibility with jpeg-6b.
919 *
920 * TODO: a bit better instructions scheduling can be achieved by expanding
921 * idct_helper/transpose_4x4 macros and reordering instructions,
922 * but readability will suffer somewhat.
923 */
924
925#define CONST_BITS 13
926
927#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
928#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
929#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
930#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
931#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
932#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
933#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
934#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
935#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
936#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
937#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
938#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
939#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
940#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
941
942.balign 16
943jsimd_idct_4x4_neon_consts:
944 .short FIX_1_847759065 /* d0[0] */
945 .short -FIX_0_765366865 /* d0[1] */
946 .short -FIX_0_211164243 /* d0[2] */
947 .short FIX_1_451774981 /* d0[3] */
948 .short -FIX_2_172734803 /* d1[0] */
949 .short FIX_1_061594337 /* d1[1] */
950 .short -FIX_0_509795579 /* d1[2] */
951 .short -FIX_0_601344887 /* d1[3] */
952 .short FIX_0_899976223 /* d2[0] */
953 .short FIX_2_562915447 /* d2[1] */
954 .short 1 << (CONST_BITS+1) /* d2[2] */
955 .short 0 /* d2[3] */
956
957.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
958 vmull.s16 q14, \x4, d2[2]
959 vmlal.s16 q14, \x8, d0[0]
960 vmlal.s16 q14, \x14, d0[1]
961
962 vmull.s16 q13, \x16, d1[2]
963 vmlal.s16 q13, \x12, d1[3]
964 vmlal.s16 q13, \x10, d2[0]
965 vmlal.s16 q13, \x6, d2[1]
966
967 vmull.s16 q15, \x4, d2[2]
968 vmlsl.s16 q15, \x8, d0[0]
969 vmlsl.s16 q15, \x14, d0[1]
970
971 vmull.s16 q12, \x16, d0[2]
972 vmlal.s16 q12, \x12, d0[3]
973 vmlal.s16 q12, \x10, d1[0]
974 vmlal.s16 q12, \x6, d1[1]
975
976 vadd.s32 q10, q14, q13
977 vsub.s32 q14, q14, q13
978
979.if \shift > 16
980 vrshr.s32 q10, q10, #\shift
981 vrshr.s32 q14, q14, #\shift
982 vmovn.s32 \y26, q10
983 vmovn.s32 \y29, q14
984.else
985 vrshrn.s32 \y26, q10, #\shift
986 vrshrn.s32 \y29, q14, #\shift
987.endif
988
989 vadd.s32 q10, q15, q12
990 vsub.s32 q15, q15, q12
991
992.if \shift > 16
993 vrshr.s32 q10, q10, #\shift
994 vrshr.s32 q15, q15, #\shift
995 vmovn.s32 \y27, q10
996 vmovn.s32 \y28, q15
997.else
998 vrshrn.s32 \y27, q10, #\shift
999 vrshrn.s32 \y28, q15, #\shift
1000.endif
1001
1002.endm
1003
1004asm_function jsimd_idct_4x4_neon
1005
1006 DCT_TABLE .req r0
1007 COEF_BLOCK .req r1
1008 OUTPUT_BUF .req r2
1009 OUTPUT_COL .req r3
1010 TMP1 .req r0
1011 TMP2 .req r1
1012 TMP3 .req r2
1013 TMP4 .req ip
1014
1015 vpush {d8-d15}
1016
1017 /* Load constants (d3 is just used for padding) */
1018 adr TMP4, jsimd_idct_4x4_neon_consts
1019 vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
1020
1021 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1022 * 0 1 2 3 | 4 5 6 7
1023 * ---------+--------
1024 * 0 | d4 | d5
1025 * 1 | d6 | d7
1026 * 2 | d8 | d9
1027 * 3 | d10 | d11
1028 * 4 | - | -
1029 * 5 | d12 | d13
1030 * 6 | d14 | d15
1031 * 7 | d16 | d17
1032 */
1033 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1034 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
1035 add COEF_BLOCK, COEF_BLOCK, #16
1036 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
1037 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1038 /* dequantize */
1039 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1040 vmul.s16 q2, q2, q9
1041 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
1042 vmul.s16 q3, q3, q10
1043 vmul.s16 q4, q4, q11
1044 add DCT_TABLE, DCT_TABLE, #16
1045 vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
1046 vmul.s16 q5, q5, q12
1047 vmul.s16 q6, q6, q13
1048 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1049 vmul.s16 q7, q7, q14
1050 vmul.s16 q8, q8, q15
1051
1052 /* Pass 1 */
1053 idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
1054 transpose_4x4 d4, d6, d8, d10
1055 idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
1056 transpose_4x4 d5, d7, d9, d11
1057
1058 /* Pass 2 */
1059 idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
1060 transpose_4x4 d26, d27, d28, d29
1061
1062 /* Range limit */
1063 vmov.u16 q15, #0x80
1064 vadd.s16 q13, q13, q15
1065 vadd.s16 q14, q14, q15
1066 vqmovun.s16 d26, q13
1067 vqmovun.s16 d27, q14
1068
1069 /* Store results to the output buffer */
1070 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
1071 add TMP1, TMP1, OUTPUT_COL
1072 add TMP2, TMP2, OUTPUT_COL
1073 add TMP3, TMP3, OUTPUT_COL
1074 add TMP4, TMP4, OUTPUT_COL
1075
1076#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1077 /* We can use much less instructions on little endian systems if the
1078 * OS kernel is not configured to trap unaligned memory accesses
1079 */
1080 vst1.32 {d26[0]}, [TMP1]!
1081 vst1.32 {d27[0]}, [TMP3]!
1082 vst1.32 {d26[1]}, [TMP2]!
1083 vst1.32 {d27[1]}, [TMP4]!
1084#else
1085 vst1.8 {d26[0]}, [TMP1]!
1086 vst1.8 {d27[0]}, [TMP3]!
1087 vst1.8 {d26[1]}, [TMP1]!
1088 vst1.8 {d27[1]}, [TMP3]!
1089 vst1.8 {d26[2]}, [TMP1]!
1090 vst1.8 {d27[2]}, [TMP3]!
1091 vst1.8 {d26[3]}, [TMP1]!
1092 vst1.8 {d27[3]}, [TMP3]!
1093
1094 vst1.8 {d26[4]}, [TMP2]!
1095 vst1.8 {d27[4]}, [TMP4]!
1096 vst1.8 {d26[5]}, [TMP2]!
1097 vst1.8 {d27[5]}, [TMP4]!
1098 vst1.8 {d26[6]}, [TMP2]!
1099 vst1.8 {d27[6]}, [TMP4]!
1100 vst1.8 {d26[7]}, [TMP2]!
1101 vst1.8 {d27[7]}, [TMP4]!
1102#endif
1103
1104 vpop {d8-d15}
1105 bx lr
1106
1107 .unreq DCT_TABLE
1108 .unreq COEF_BLOCK
1109 .unreq OUTPUT_BUF
1110 .unreq OUTPUT_COL
1111 .unreq TMP1
1112 .unreq TMP2
1113 .unreq TMP3
1114 .unreq TMP4
1115.endfunc
1116
1117.purgem idct_helper
1118
1119/*****************************************************************************/
1120
1121/*
1122 * jsimd_idct_2x2_neon
1123 *
1124 * This function contains inverse-DCT code for getting reduced-size
1125 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1126 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1127 * function from jpeg-6b (jidctred.c).
1128 *
1129 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
1131 * The primary purpose of this particular NEON optimized function is
1132 * bit exact compatibility with jpeg-6b.
1133 */
1134
1135.balign 8
1136jsimd_idct_2x2_neon_consts:
1137 .short -FIX_0_720959822 /* d0[0] */
1138 .short FIX_0_850430095 /* d0[1] */
1139 .short -FIX_1_272758580 /* d0[2] */
1140 .short FIX_3_624509785 /* d0[3] */
1141
1142.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1143 vshll.s16 q14, \x4, #15
1144 vmull.s16 q13, \x6, d0[3]
1145 vmlal.s16 q13, \x10, d0[2]
1146 vmlal.s16 q13, \x12, d0[1]
1147 vmlal.s16 q13, \x16, d0[0]
1148
1149 vadd.s32 q10, q14, q13
1150 vsub.s32 q14, q14, q13
1151
1152.if \shift > 16
1153 vrshr.s32 q10, q10, #\shift
1154 vrshr.s32 q14, q14, #\shift
1155 vmovn.s32 \y26, q10
1156 vmovn.s32 \y27, q14
1157.else
1158 vrshrn.s32 \y26, q10, #\shift
1159 vrshrn.s32 \y27, q14, #\shift
1160.endif
1161
1162.endm
1163
1164asm_function jsimd_idct_2x2_neon
1165
1166 DCT_TABLE .req r0
1167 COEF_BLOCK .req r1
1168 OUTPUT_BUF .req r2
1169 OUTPUT_COL .req r3
1170 TMP1 .req r0
1171 TMP2 .req ip
1172
1173 vpush {d8-d15}
1174
1175 /* Load constants */
1176 adr TMP2, jsimd_idct_2x2_neon_consts
1177 vld1.16 {d0}, [TMP2, :64]
1178
1179 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1180 * 0 1 2 3 | 4 5 6 7
1181 * ---------+--------
1182 * 0 | d4 | d5
1183 * 1 | d6 | d7
1184 * 2 | - | -
1185 * 3 | d10 | d11
1186 * 4 | - | -
1187 * 5 | d12 | d13
1188 * 6 | - | -
1189 * 7 | d16 | d17
1190 */
1191 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1192 add COEF_BLOCK, COEF_BLOCK, #16
1193 vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
1194 add COEF_BLOCK, COEF_BLOCK, #16
1195 vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
1196 add COEF_BLOCK, COEF_BLOCK, #16
1197 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1198 /* Dequantize */
1199 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1200 vmul.s16 q2, q2, q9
1201 vmul.s16 q3, q3, q10
1202 add DCT_TABLE, DCT_TABLE, #16
1203 vld1.16 {d24, d25}, [DCT_TABLE, :128]!
1204 vmul.s16 q5, q5, q12
1205 add DCT_TABLE, DCT_TABLE, #16
1206 vld1.16 {d26, d27}, [DCT_TABLE, :128]!
1207 vmul.s16 q6, q6, q13
1208 add DCT_TABLE, DCT_TABLE, #16
1209 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1210 vmul.s16 q8, q8, q15
1211
1212 /* Pass 1 */
1213#if 0
1214 idct_helper d4, d6, d10, d12, d16, 13, d4, d6
1215 transpose_4x4 d4, d6, d8, d10
1216 idct_helper d5, d7, d11, d13, d17, 13, d5, d7
1217 transpose_4x4 d5, d7, d9, d11
1218#else
1219 vmull.s16 q13, d6, d0[3]
1220 vmlal.s16 q13, d10, d0[2]
1221 vmlal.s16 q13, d12, d0[1]
1222 vmlal.s16 q13, d16, d0[0]
1223 vmull.s16 q12, d7, d0[3]
1224 vmlal.s16 q12, d11, d0[2]
1225 vmlal.s16 q12, d13, d0[1]
1226 vmlal.s16 q12, d17, d0[0]
1227 vshll.s16 q14, d4, #15
1228 vshll.s16 q15, d5, #15
1229 vadd.s32 q10, q14, q13
1230 vsub.s32 q14, q14, q13
1231 vrshrn.s32 d4, q10, #13
1232 vrshrn.s32 d6, q14, #13
1233 vadd.s32 q10, q15, q12
1234 vsub.s32 q14, q15, q12
1235 vrshrn.s32 d5, q10, #13
1236 vrshrn.s32 d7, q14, #13
1237 vtrn.16 q2, q3
1238 vtrn.32 q3, q5
1239#endif
1240
1241 /* Pass 2 */
1242 idct_helper d4, d6, d10, d7, d11, 20, d26, d27
1243
1244 /* Range limit */
1245 vmov.u16 q15, #0x80
1246 vadd.s16 q13, q13, q15
1247 vqmovun.s16 d26, q13
1248 vqmovun.s16 d27, q13
1249
1250 /* Store results to the output buffer */
1251 ldmia OUTPUT_BUF, {TMP1, TMP2}
1252 add TMP1, TMP1, OUTPUT_COL
1253 add TMP2, TMP2, OUTPUT_COL
1254
1255 vst1.8 {d26[0]}, [TMP1]!
1256 vst1.8 {d27[4]}, [TMP1]!
1257 vst1.8 {d26[1]}, [TMP2]!
1258 vst1.8 {d27[5]}, [TMP2]!
1259
1260 vpop {d8-d15}
1261 bx lr
1262
1263 .unreq DCT_TABLE
1264 .unreq COEF_BLOCK
1265 .unreq OUTPUT_BUF
1266 .unreq OUTPUT_COL
1267 .unreq TMP1
1268 .unreq TMP2
1269.endfunc
1270
1271.purgem idct_helper
1272
1273/*****************************************************************************/
1274
1275/*
DRC321e0682011-05-03 08:47:43 +00001276 * jsimd_ycc_extrgb_convert_neon
1277 * jsimd_ycc_extbgr_convert_neon
1278 * jsimd_ycc_extrgbx_convert_neon
1279 * jsimd_ycc_extbgrx_convert_neon
1280 * jsimd_ycc_extxbgr_convert_neon
1281 * jsimd_ycc_extxrgb_convert_neon
1282 *
1283 * Colorspace conversion YCbCr -> RGB
1284 */
1285
DRC321e0682011-05-03 08:47:43 +00001286
1287.macro do_load size
DRC4346f912011-06-14 22:16:50 +00001288 .if \size == 8
DRC98a44fe2011-08-24 23:27:44 +00001289 vld1.8 {d4}, [U, :64]!
1290 vld1.8 {d5}, [V, :64]!
1291 vld1.8 {d0}, [Y, :64]!
DRC321e0682011-05-03 08:47:43 +00001292 pld [U, #64]
1293 pld [V, #64]
DRC98a44fe2011-08-24 23:27:44 +00001294 pld [Y, #64]
DRC4346f912011-06-14 22:16:50 +00001295 .elseif \size == 4
DRC321e0682011-05-03 08:47:43 +00001296 vld1.8 {d4[0]}, [U]!
1297 vld1.8 {d4[1]}, [U]!
1298 vld1.8 {d4[2]}, [U]!
1299 vld1.8 {d4[3]}, [U]!
1300 vld1.8 {d5[0]}, [V]!
1301 vld1.8 {d5[1]}, [V]!
1302 vld1.8 {d5[2]}, [V]!
1303 vld1.8 {d5[3]}, [V]!
1304 vld1.8 {d0[0]}, [Y]!
1305 vld1.8 {d0[1]}, [Y]!
1306 vld1.8 {d0[2]}, [Y]!
1307 vld1.8 {d0[3]}, [Y]!
DRC4346f912011-06-14 22:16:50 +00001308 .elseif \size == 2
DRC321e0682011-05-03 08:47:43 +00001309 vld1.8 {d4[4]}, [U]!
1310 vld1.8 {d4[5]}, [U]!
1311 vld1.8 {d5[4]}, [V]!
1312 vld1.8 {d5[5]}, [V]!
1313 vld1.8 {d0[4]}, [Y]!
1314 vld1.8 {d0[5]}, [Y]!
DRC4346f912011-06-14 22:16:50 +00001315 .elseif \size == 1
DRC321e0682011-05-03 08:47:43 +00001316 vld1.8 {d4[6]}, [U]!
1317 vld1.8 {d5[6]}, [V]!
1318 vld1.8 {d0[6]}, [Y]!
1319 .else
1320 .error unsupported macroblock size
1321 .endif
1322.endm
1323
1324.macro do_store bpp, size
DRC4346f912011-06-14 22:16:50 +00001325 .if \bpp == 24
1326 .if \size == 8
DRC321e0682011-05-03 08:47:43 +00001327 vst3.8 {d10, d11, d12}, [RGB]!
DRC4346f912011-06-14 22:16:50 +00001328 .elseif \size == 4
DRC321e0682011-05-03 08:47:43 +00001329 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1330 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1331 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1332 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
DRC4346f912011-06-14 22:16:50 +00001333 .elseif \size == 2
DRC321e0682011-05-03 08:47:43 +00001334 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1335 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
DRC4346f912011-06-14 22:16:50 +00001336 .elseif \size == 1
DRC321e0682011-05-03 08:47:43 +00001337 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1338 .else
1339 .error unsupported macroblock size
1340 .endif
DRC4346f912011-06-14 22:16:50 +00001341 .elseif \bpp == 32
1342 .if \size == 8
DRC321e0682011-05-03 08:47:43 +00001343 vst4.8 {d10, d11, d12, d13}, [RGB]!
DRC4346f912011-06-14 22:16:50 +00001344 .elseif \size == 4
DRC321e0682011-05-03 08:47:43 +00001345 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1346 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1347 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1348 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
DRC4346f912011-06-14 22:16:50 +00001349 .elseif \size == 2
DRC321e0682011-05-03 08:47:43 +00001350 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1351 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
DRC4346f912011-06-14 22:16:50 +00001352 .elseif \size == 1
DRC321e0682011-05-03 08:47:43 +00001353 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1354 .else
1355 .error unsupported macroblock size
1356 .endif
1357 .else
1358 .error unsupported bpp
1359 .endif
1360.endm
1361
1362.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1363
DRC98a44fe2011-08-24 23:27:44 +00001364/*
1365 * 2 stage pipelined YCbCr->RGB conversion
1366 */
1367
1368.macro do_yuv_to_rgb_stage1
DRC321e0682011-05-03 08:47:43 +00001369 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1370 vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
1371 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1372 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1373 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1374 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1375 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1376 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1377 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1378 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
DRC98a44fe2011-08-24 23:27:44 +00001379.endm
1380
1381.macro do_yuv_to_rgb_stage2
DRC321e0682011-05-03 08:47:43 +00001382 vrshrn.s32 d20, q10, #15
1383 vrshrn.s32 d21, q11, #15
1384 vrshrn.s32 d24, q12, #14
1385 vrshrn.s32 d25, q13, #14
1386 vrshrn.s32 d28, q14, #14
1387 vrshrn.s32 d29, q15, #14
1388 vaddw.u8 q10, q10, d0
1389 vaddw.u8 q12, q12, d0
1390 vaddw.u8 q14, q14, d0
DRC4346f912011-06-14 22:16:50 +00001391 vqmovun.s16 d1\g_offs, q10
1392 vqmovun.s16 d1\r_offs, q12
1393 vqmovun.s16 d1\b_offs, q14
DRC321e0682011-05-03 08:47:43 +00001394.endm
1395
DRC98a44fe2011-08-24 23:27:44 +00001396.macro do_yuv_to_rgb_stage2_store_load_stage1
1397 vld1.8 {d4}, [U, :64]!
1398 vrshrn.s32 d20, q10, #15
1399 vrshrn.s32 d21, q11, #15
1400 vrshrn.s32 d24, q12, #14
1401 vrshrn.s32 d25, q13, #14
1402 vrshrn.s32 d28, q14, #14
1403 vld1.8 {d5}, [V, :64]!
1404 vrshrn.s32 d29, q15, #14
1405 vaddw.u8 q10, q10, d0
1406 vaddw.u8 q12, q12, d0
1407 vaddw.u8 q14, q14, d0
1408 vqmovun.s16 d1\g_offs, q10
1409 vld1.8 {d0}, [Y, :64]!
1410 vqmovun.s16 d1\r_offs, q12
1411 pld [U, #64]
1412 pld [V, #64]
1413 pld [Y, #64]
1414 vqmovun.s16 d1\b_offs, q14
1415 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1416 vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
1417 do_store \bpp, 8
1418 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1419 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1420 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1421 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1422 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1423 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1424 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1425 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1426.endm
1427
1428.macro do_yuv_to_rgb
1429 do_yuv_to_rgb_stage1
1430 do_yuv_to_rgb_stage2
1431.endm
1432
DRC4346f912011-06-14 22:16:50 +00001433/* Apple gas crashes on adrl, work around that by using adr.
1434 * But this requires a copy of these constants for each function.
1435 */
1436
1437.balign 16
1438jsimd_ycc_\colorid\()_neon_consts:
1439 .short 0, 0, 0, 0
1440 .short 22971, -11277, -23401, 29033
1441 .short -128, -128, -128, -128
1442 .short -128, -128, -128, -128
1443
1444asm_function jsimd_ycc_\colorid\()_convert_neon
DRC321e0682011-05-03 08:47:43 +00001445 OUTPUT_WIDTH .req r0
1446 INPUT_BUF .req r1
1447 INPUT_ROW .req r2
1448 OUTPUT_BUF .req r3
1449 NUM_ROWS .req r4
1450
1451 INPUT_BUF0 .req r5
1452 INPUT_BUF1 .req r6
1453 INPUT_BUF2 .req INPUT_BUF
1454
1455 RGB .req r7
1456 Y .req r8
1457 U .req r9
1458 V .req r10
1459 N .req ip
1460
1461 /* Load constants to d1, d2, d3 (d0 is just used for padding) */
DRC4346f912011-06-14 22:16:50 +00001462 adr ip, jsimd_ycc_\colorid\()_neon_consts
DRC321e0682011-05-03 08:47:43 +00001463 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1464
1465 /* Save ARM registers and handle input arguments */
1466 push {r4, r5, r6, r7, r8, r9, r10, lr}
1467 ldr NUM_ROWS, [sp, #(4 * 8)]
1468 ldr INPUT_BUF0, [INPUT_BUF]
1469 ldr INPUT_BUF1, [INPUT_BUF, #4]
1470 ldr INPUT_BUF2, [INPUT_BUF, #8]
1471 .unreq INPUT_BUF
1472
1473 /* Save NEON registers */
1474 vpush {d8-d15}
1475
1476 /* Initially set d10, d11, d12, d13 to 0xFF */
1477 vmov.u8 q5, #255
1478 vmov.u8 q6, #255
1479
1480 /* Outer loop over scanlines */
1481 cmp NUM_ROWS, #1
1482 blt 9f
14830:
1484 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
1485 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1486 mov N, OUTPUT_WIDTH
1487 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1488 add INPUT_ROW, INPUT_ROW, #1
1489 ldr RGB, [OUTPUT_BUF], #4
1490
1491 /* Inner loop over pixels */
1492 subs N, N, #8
DRC98a44fe2011-08-24 23:27:44 +00001493 blt 3f
1494 do_load 8
1495 do_yuv_to_rgb_stage1
1496 subs N, N, #8
DRC321e0682011-05-03 08:47:43 +00001497 blt 2f
14981:
DRC98a44fe2011-08-24 23:27:44 +00001499 do_yuv_to_rgb_stage2_store_load_stage1
DRC321e0682011-05-03 08:47:43 +00001500 subs N, N, #8
1501 bge 1b
DRC98a44fe2011-08-24 23:27:44 +000015022:
1503 do_yuv_to_rgb_stage2
1504 do_store \bpp, 8
DRC321e0682011-05-03 08:47:43 +00001505 tst N, #7
1506 beq 8f
DRC98a44fe2011-08-24 23:27:44 +000015073:
DRC321e0682011-05-03 08:47:43 +00001508 tst N, #4
1509 beq 3f
1510 do_load 4
15113:
1512 tst N, #2
1513 beq 4f
1514 do_load 2
15154:
1516 tst N, #1
1517 beq 5f
1518 do_load 1
15195:
1520 do_yuv_to_rgb
1521 tst N, #4
1522 beq 6f
DRC4346f912011-06-14 22:16:50 +00001523 do_store \bpp, 4
DRC321e0682011-05-03 08:47:43 +000015246:
1525 tst N, #2
1526 beq 7f
DRC4346f912011-06-14 22:16:50 +00001527 do_store \bpp, 2
DRC321e0682011-05-03 08:47:43 +000015287:
1529 tst N, #1
1530 beq 8f
DRC4346f912011-06-14 22:16:50 +00001531 do_store \bpp, 1
DRC321e0682011-05-03 08:47:43 +000015328:
1533 subs NUM_ROWS, NUM_ROWS, #1
1534 bgt 0b
15359:
1536 /* Restore all registers and return */
1537 vpop {d8-d15}
1538 pop {r4, r5, r6, r7, r8, r9, r10, pc}
1539
1540 .unreq OUTPUT_WIDTH
1541 .unreq INPUT_ROW
1542 .unreq OUTPUT_BUF
1543 .unreq NUM_ROWS
1544 .unreq INPUT_BUF0
1545 .unreq INPUT_BUF1
1546 .unreq INPUT_BUF2
1547 .unreq RGB
1548 .unreq Y
1549 .unreq U
1550 .unreq V
1551 .unreq N
1552.endfunc
1553
1554.purgem do_yuv_to_rgb
DRC98a44fe2011-08-24 23:27:44 +00001555.purgem do_yuv_to_rgb_stage1
1556.purgem do_yuv_to_rgb_stage2
1557.purgem do_yuv_to_rgb_stage2_store_load_stage1
DRC321e0682011-05-03 08:47:43 +00001558
1559.endm
1560
1561/*--------------------------------- id ----- bpp R G B */
1562generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
1563generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
1564generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1565generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1566generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1567generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1568
1569.purgem do_load
1570.purgem do_store
1571
1572/*****************************************************************************/
DRCb7400542011-08-10 23:31:13 +00001573
1574/*
DRC7a9376c2011-08-12 19:27:20 +00001575 * jsimd_extrgb_ycc_convert_neon
1576 * jsimd_extbgr_ycc_convert_neon
1577 * jsimd_extrgbx_ycc_convert_neon
1578 * jsimd_extbgrx_ycc_convert_neon
1579 * jsimd_extxbgr_ycc_convert_neon
1580 * jsimd_extxrgb_ycc_convert_neon
1581 *
1582 * Colorspace conversion RGB -> YCbCr
1583 */
1584
1585.macro do_store size
1586 .if \size == 8
1587 vst1.8 {d20}, [Y]!
1588 vst1.8 {d21}, [U]!
1589 vst1.8 {d22}, [V]!
1590 .elseif \size == 4
1591 vst1.8 {d20[0]}, [Y]!
1592 vst1.8 {d20[1]}, [Y]!
1593 vst1.8 {d20[2]}, [Y]!
1594 vst1.8 {d20[3]}, [Y]!
1595 vst1.8 {d21[0]}, [U]!
1596 vst1.8 {d21[1]}, [U]!
1597 vst1.8 {d21[2]}, [U]!
1598 vst1.8 {d21[3]}, [U]!
1599 vst1.8 {d22[0]}, [V]!
1600 vst1.8 {d22[1]}, [V]!
1601 vst1.8 {d22[2]}, [V]!
1602 vst1.8 {d22[3]}, [V]!
1603 .elseif \size == 2
1604 vst1.8 {d20[4]}, [Y]!
1605 vst1.8 {d20[5]}, [Y]!
1606 vst1.8 {d21[4]}, [U]!
1607 vst1.8 {d21[5]}, [U]!
1608 vst1.8 {d22[4]}, [V]!
1609 vst1.8 {d22[5]}, [V]!
1610 .elseif \size == 1
1611 vst1.8 {d20[6]}, [Y]!
1612 vst1.8 {d21[6]}, [U]!
1613 vst1.8 {d22[6]}, [V]!
1614 .else
1615 .error unsupported macroblock size
1616 .endif
1617.endm
1618
1619.macro do_load bpp, size
1620 .if \bpp == 24
1621 .if \size == 8
1622 vld3.8 {d10, d11, d12}, [RGB]!
1623 pld [RGB, #128]
1624 .elseif \size == 4
1625 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1626 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1627 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1628 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1629 .elseif \size == 2
1630 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1631 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1632 .elseif \size == 1
1633 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1634 .else
1635 .error unsupported macroblock size
1636 .endif
1637 .elseif \bpp == 32
1638 .if \size == 8
1639 vld4.8 {d10, d11, d12, d13}, [RGB]!
1640 pld [RGB, #128]
1641 .elseif \size == 4
1642 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1643 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1644 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1645 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1646 .elseif \size == 2
1647 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1648 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1649 .elseif \size == 1
1650 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1651 .else
1652 .error unsupported macroblock size
1653 .endif
1654 .else
1655 .error unsupported bpp
1656 .endif
1657.endm
1658
1659.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1660
1661/*
1662 * 2 stage pipelined RGB->YCbCr conversion
1663 */
1664
1665.macro do_rgb_to_yuv_stage1
1666 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1667 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1668 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1669 vmull.u16 q7, d4, d0[0]
1670 vmlal.u16 q7, d6, d0[1]
1671 vmlal.u16 q7, d8, d0[2]
1672 vmull.u16 q8, d5, d0[0]
1673 vmlal.u16 q8, d7, d0[1]
1674 vmlal.u16 q8, d9, d0[2]
1675 vrev64.32 q9, q1
1676 vrev64.32 q13, q1
1677 vmlsl.u16 q9, d4, d0[3]
1678 vmlsl.u16 q9, d6, d1[0]
1679 vmlal.u16 q9, d8, d1[1]
1680 vmlsl.u16 q13, d5, d0[3]
1681 vmlsl.u16 q13, d7, d1[0]
1682 vmlal.u16 q13, d9, d1[1]
1683 vrev64.32 q14, q1
1684 vrev64.32 q15, q1
1685 vmlal.u16 q14, d4, d1[1]
1686 vmlsl.u16 q14, d6, d1[2]
1687 vmlsl.u16 q14, d8, d1[3]
1688 vmlal.u16 q15, d5, d1[1]
1689 vmlsl.u16 q15, d7, d1[2]
1690 vmlsl.u16 q15, d9, d1[3]
1691.endm
1692
1693.macro do_rgb_to_yuv_stage2
1694 vrshrn.u32 d20, q7, #16
1695 vrshrn.u32 d21, q8, #16
1696 vshrn.u32 d22, q9, #16
1697 vshrn.u32 d23, q13, #16
1698 vshrn.u32 d24, q14, #16
1699 vshrn.u32 d25, q15, #16
1700 vmovn.u16 d20, q10 /* d20 = y */
1701 vmovn.u16 d21, q11 /* d21 = u */
1702 vmovn.u16 d22, q12 /* d22 = v */
1703.endm
1704
1705.macro do_rgb_to_yuv
1706 do_rgb_to_yuv_stage1
1707 do_rgb_to_yuv_stage2
1708.endm
1709
1710.macro do_rgb_to_yuv_stage2_store_load_stage1
1711 vrshrn.u32 d20, q7, #16
1712 vrshrn.u32 d21, q8, #16
1713 vshrn.u32 d22, q9, #16
1714 vrev64.32 q9, q1
1715 vshrn.u32 d23, q13, #16
1716 vrev64.32 q13, q1
1717 vshrn.u32 d24, q14, #16
1718 vshrn.u32 d25, q15, #16
1719 do_load \bpp, 8
1720 vmovn.u16 d20, q10 /* d20 = y */
1721 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1722 vmovn.u16 d21, q11 /* d21 = u */
1723 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1724 vmovn.u16 d22, q12 /* d22 = v */
1725 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1726 vmull.u16 q7, d4, d0[0]
1727 vmlal.u16 q7, d6, d0[1]
1728 vmlal.u16 q7, d8, d0[2]
1729 vst1.8 {d20}, [Y]!
1730 vmull.u16 q8, d5, d0[0]
1731 vmlal.u16 q8, d7, d0[1]
1732 vmlal.u16 q8, d9, d0[2]
1733 vmlsl.u16 q9, d4, d0[3]
1734 vmlsl.u16 q9, d6, d1[0]
1735 vmlal.u16 q9, d8, d1[1]
1736 vst1.8 {d21}, [U]!
1737 vmlsl.u16 q13, d5, d0[3]
1738 vmlsl.u16 q13, d7, d1[0]
1739 vmlal.u16 q13, d9, d1[1]
1740 vrev64.32 q14, q1
1741 vrev64.32 q15, q1
1742 vmlal.u16 q14, d4, d1[1]
1743 vmlsl.u16 q14, d6, d1[2]
1744 vmlsl.u16 q14, d8, d1[3]
1745 vst1.8 {d22}, [V]!
1746 vmlal.u16 q15, d5, d1[1]
1747 vmlsl.u16 q15, d7, d1[2]
1748 vmlsl.u16 q15, d9, d1[3]
1749.endm
1750
1751.balign 16
1752jsimd_\colorid\()_ycc_neon_consts:
1753 .short 19595, 38470, 7471, 11059
1754 .short 21709, 32768, 27439, 5329
1755 .short 32767, 128, 32767, 128
1756 .short 32767, 128, 32767, 128
1757
1758asm_function jsimd_\colorid\()_ycc_convert_neon
1759 OUTPUT_WIDTH .req r0
1760 INPUT_BUF .req r1
1761 OUTPUT_BUF .req r2
1762 OUTPUT_ROW .req r3
1763 NUM_ROWS .req r4
1764
1765 OUTPUT_BUF0 .req r5
1766 OUTPUT_BUF1 .req r6
1767 OUTPUT_BUF2 .req OUTPUT_BUF
1768
1769 RGB .req r7
1770 Y .req r8
1771 U .req r9
1772 V .req r10
1773 N .req ip
1774
1775 /* Load constants to d0, d1, d2, d3 */
1776 adr ip, jsimd_\colorid\()_ycc_neon_consts
1777 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1778
1779 /* Save ARM registers and handle input arguments */
1780 push {r4, r5, r6, r7, r8, r9, r10, lr}
1781 ldr NUM_ROWS, [sp, #(4 * 8)]
1782 ldr OUTPUT_BUF0, [OUTPUT_BUF]
1783 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
1784 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
1785 .unreq OUTPUT_BUF
1786
1787 /* Save NEON registers */
1788 vpush {d8-d15}
1789
1790 /* Outer loop over scanlines */
1791 cmp NUM_ROWS, #1
1792 blt 9f
17930:
1794 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1795 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1796 mov N, OUTPUT_WIDTH
1797 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1798 add OUTPUT_ROW, OUTPUT_ROW, #1
1799 ldr RGB, [INPUT_BUF], #4
1800
1801 /* Inner loop over pixels */
1802 subs N, N, #8
1803 blt 3f
1804 do_load \bpp, 8
1805 do_rgb_to_yuv_stage1
1806 subs N, N, #8
1807 blt 2f
18081:
1809 do_rgb_to_yuv_stage2_store_load_stage1
1810 subs N, N, #8
1811 bge 1b
18122:
1813 do_rgb_to_yuv_stage2
1814 do_store 8
1815 tst N, #7
1816 beq 8f
18173:
1818 tst N, #4
1819 beq 3f
1820 do_load \bpp, 4
18213:
1822 tst N, #2
1823 beq 4f
1824 do_load \bpp, 2
18254:
1826 tst N, #1
1827 beq 5f
1828 do_load \bpp, 1
18295:
1830 do_rgb_to_yuv
1831 tst N, #4
1832 beq 6f
1833 do_store 4
18346:
1835 tst N, #2
1836 beq 7f
1837 do_store 2
18387:
1839 tst N, #1
1840 beq 8f
1841 do_store 1
18428:
1843 subs NUM_ROWS, NUM_ROWS, #1
1844 bgt 0b
18459:
1846 /* Restore all registers and return */
1847 vpop {d8-d15}
1848 pop {r4, r5, r6, r7, r8, r9, r10, pc}
1849
1850 .unreq OUTPUT_WIDTH
1851 .unreq OUTPUT_ROW
1852 .unreq INPUT_BUF
1853 .unreq NUM_ROWS
1854 .unreq OUTPUT_BUF0
1855 .unreq OUTPUT_BUF1
1856 .unreq OUTPUT_BUF2
1857 .unreq RGB
1858 .unreq Y
1859 .unreq U
1860 .unreq V
1861 .unreq N
1862.endfunc
1863
1864.purgem do_rgb_to_yuv
1865.purgem do_rgb_to_yuv_stage1
1866.purgem do_rgb_to_yuv_stage2
1867.purgem do_rgb_to_yuv_stage2_store_load_stage1
1868
1869.endm
1870
1871/*--------------------------------- id ----- bpp R G B */
1872generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
1873generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
1874generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1875generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1876generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1877generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1878
1879.purgem do_load
1880.purgem do_store
1881
1882/*****************************************************************************/
1883
/*
 * jsimd_convsamp_neon
 *
 * Load data into workspace, applying unsigned->signed conversion:
 * an 8x8 block of 8-bit samples is widened to 16 bits with 128
 * subtracted from each value (level shift).
 *
 * In:  r0 = sample_data  (array of 8 row pointers; two LDMIAs below
 *                         fetch 4 pointers each)
 *      r1 = start_col    (byte offset added to every row pointer)
 *      r2 = workspace    (output: 64 x int16, stored 16-byte aligned)
 * Clobbers r3, ip, d0, q8-q15; r4/r5 are saved and restored.
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */

asm_function jsimd_convsamp_neon
    SAMPLE_DATA .req r0
    START_COL .req r1
    WORKSPACE .req r2
    TMP1 .req r3
    TMP2 .req r4
    TMP3 .req r5
    TMP4 .req ip

    push {r4, r5}               /* r4/r5 are callee-saved (AAPCS) */
    vmov.u8 d0, #128            /* level-shift constant */

    /* Rows 0-3: fetch the four row pointers, add the column offset */
    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    /* Load each row and widen u8 -> s16 while subtracting 128 */
    vld1.8 {d16}, [TMP1]
    vsubl.u8 q8, d16, d0
    vld1.8 {d18}, [TMP2]
    vsubl.u8 q9, d18, d0
    vld1.8 {d20}, [TMP3]
    vsubl.u8 q10, d20, d0
    vld1.8 {d22}, [TMP4]
    /* Rows 4-7: pointer fetch and offset adds are interleaved with the
     * stores of rows 0-3 to help instruction scheduling */
    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8 q11, d22, d0
    vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d24}, [TMP1]
    vsubl.u8 q12, d24, d0
    vld1.8 {d26}, [TMP2]
    vsubl.u8 q13, d26, d0
    vld1.8 {d28}, [TMP3]
    vsubl.u8 q14, d28, d0
    vld1.8 {d30}, [TMP4]
    vsubl.u8 q15, d30, d0
    vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop {r4, r5}
    bx lr

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc
1944
1945/*****************************************************************************/
1946
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

/* Lane accessors for the four constants preloaded into d0 (below) */
#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

/*
 * Fixed-point multiplier table, consumed via VQDMULH which computes
 * (2*a*b) >> 16, i.e. multiplies by value / 2^15.  The 1.306562965
 * constant stores only the fractional part; the integer 1.0 is applied
 * with a separate addition in the FDCT code.
 */
.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128) /* XFIX_0_382683433 */
    .short (139 * 128) /* XFIX_0_541196100 */
    .short (181 * 128) /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
1970
asm_function jsimd_fdct_ifast_neon

    /* In: r0 = DATA, pointer to the 8x8 int16 block (16-byte aligned),
     * transformed in place.  Clobbers ip, d0, q1-q15; d8-d15 are
     * preserved via VPUSH/VPOP as required by the AAPCS. */
    DATA .req r0
    TMP .req ip

    vpush {d8-d15}              /* d8-d15 (q4-q7) are callee-saved */

    /* Load constants */
    adr TMP, jsimd_fdct_ifast_neon_consts
    vld1.16 {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d16   |  d17   | q8
     *   1  |  d18   |  d19   | q9
     *   2  |  d20   |  d21   | q10
     *   3  |  d22   |  d23   | q11
     *   4  |  d24   |  d25   | q12
     *   5  |  d26   |  d27   | q13
     *   6  |  d28   |  d29   | q14
     *   7  |  d30   |  d31   | q15
     */

    vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16 {d28, d29, d30, d31}, [DATA, :128]
    sub DATA, DATA, #(128 - 32) /* rewind: 3 loads post-incremented 96 B */

    mov TMP, #2                 /* two 1-D passes: rows, then columns */
1:
    /* Transpose the 8x8 block so each pass operates along rows */
    vtrn.16 q12, q13
    vtrn.16 q10, q11
    vtrn.16 q8, q9
    vtrn.16 q14, q15
    vtrn.32 q9, q11
    vtrn.32 q13, q15
    vtrn.32 q8, q10
    vtrn.32 q12, q14
    vswp d30, d23
    vswp d24, d17
    vswp d26, d19
    /* 1-D FDCT (same dataflow as IJG's jpeg_fdct_ifast) */
    vadd.s16 q2, q11, q12
    vswp d28, d21               /* final swap of the transpose, interleaved
                                   with the first butterfly */
    vsub.s16 q12, q11, q12
    vsub.s16 q6, q10, q13
    vadd.s16 q10, q10, q13
    vsub.s16 q7, q9, q14
    vadd.s16 q9, q9, q14
    vsub.s16 q1, q8, q15
    vadd.s16 q8, q8, q15
    vsub.s16 q4, q9, q10
    vsub.s16 q5, q8, q2
    vadd.s16 q3, q9, q10
    vadd.s16 q4, q4, q5
    vadd.s16 q2, q8, q2
    vqdmulh.s16 q4, q4, XFIX_0_707106781
    vadd.s16 q11, q12, q6
    vadd.s16 q8, q2, q3
    vsub.s16 q12, q2, q3
    vadd.s16 q3, q6, q7
    vadd.s16 q7, q7, q1
    vqdmulh.s16 q3, q3, XFIX_0_707106781
    vsub.s16 q6, q11, q7
    vadd.s16 q10, q5, q4
    vqdmulh.s16 q6, q6, XFIX_0_382683433
    vsub.s16 q14, q5, q4
    vqdmulh.s16 q11, q11, XFIX_0_541196100
    vqdmulh.s16 q5, q7, XFIX_1_306562965
    vadd.s16 q4, q1, q3
    vsub.s16 q3, q1, q3
    vadd.s16 q7, q7, q6
    vadd.s16 q11, q11, q6
    vadd.s16 q7, q7, q5         /* q7 += q7*frac: completes the x1.306...
                                   multiply (integer part added above) */
    vadd.s16 q13, q3, q11
    vsub.s16 q11, q3, q11
    vadd.s16 q9, q4, q7
    vsub.s16 q15, q4, q7
    subs TMP, TMP, #1
    bne 1b

    /* store results */
    vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16 {d28, d29, d30, d31}, [DATA, :128]

    vpop {d8-d15}
    bx lr

    .unreq DATA
    .unreq TMP
.endfunc
2067
2068/*****************************************************************************/
DRC82bd5212011-08-17 21:00:59 +00002069
/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Quantizes the 64 DCT coefficients in 'workspace' into 'coef_block'.
 * Per coefficient: the sign is stripped (VABS), a correction term is
 * added, the magnitude is multiplied by a 16-bit reciprocal (keeping
 * the high half of the 32-bit product), shifted right by a per-element
 * count, and the sign is restored with an XOR/SUB pair.
 *
 * The divisors table is addressed as consecutive 64-element sub-tables:
 * reciprocals at +0, corrections at +(64*2) bytes, shift counts at
 * +(64*6) bytes.  NOTE(review): layout inferred from the offsets used
 * below -- confirm against the C code that builds the table.
 *
 * Note: the code uses 2 stage pipelining in order to improve instructions
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK .req r0
    DIVISORS .req r1
    WORKSPACE .req r2

    RECIPROCAL .req DIVISORS
    CORRECTION .req r3
    SHIFT .req ip
    LOOP_COUNT .req r4

    /* Prologue: stage 1 for the first batch of 16 coefficients */
    vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16 q12, q0            /* work on magnitudes */
    add CORRECTION, DIVISORS, #(64 * 2)
    add SHIFT, DIVISORS, #(64 * 6)
    vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16 q13, q1
    vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16 q12, q12, q10 /* add correction */
    vadd.u16 q13, q13, q11
    vmull.u16 q10, d24, d16 /* multiply by reciprocal */
    vmull.u16 q11, d25, d17
    vmull.u16 q8, d26, d18
    vmull.u16 q9, d27, d19
    vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32 d20, q10, #16 /* narrow: keep high half of each product */
    vshrn.u32 d21, q11, #16
    vshrn.u32 d22, q8, #16
    vshrn.u32 d23, q9, #16
    vneg.s16 q12, q12           /* negate counts: VSHL by a negative
                                   amount performs a right shift */
    vneg.s16 q13, q13
    vshr.s16 q2, q0, #15 /* extract sign */
    vshr.s16 q3, q1, #15
    vshl.u16 q14, q10, q12 /* shift */
    vshl.u16 q15, q11, q13

    push {r4, r5}               /* r5 saved only to keep the stack
                                   8-byte aligned (AAPCS) */
    mov LOOP_COUNT, #3
1:
    /* Stage 1 for the next 16 coefficients, interleaved with stage 2
     * (sign restoration + store) for the previous 16 */
    vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
    veor.u16 q14, q14, q2 /* restore sign */
    vabs.s16 q12, q0
    vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16 q13, q1
    veor.u16 q15, q15, q3
    vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16 q12, q12, q10 /* add correction */
    vadd.u16 q13, q13, q11
    vmull.u16 q10, d24, d16 /* multiply by reciprocal */
    vmull.u16 q11, d25, d17
    vmull.u16 q8, d26, d18
    vmull.u16 q9, d27, d19
    vsub.u16 q14, q14, q2 /* XOR+SUB with sign mask = negate if negative */
    vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
    vsub.u16 q15, q15, q3
    vshrn.u32 d20, q10, #16
    vshrn.u32 d21, q11, #16
    vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32 d22, q8, #16
    vshrn.u32 d23, q9, #16
    vneg.s16 q12, q12
    vneg.s16 q13, q13
    vshr.s16 q2, q0, #15 /* extract sign */
    vshr.s16 q3, q1, #15
    vshl.u16 q14, q10, q12 /* shift */
    vshl.u16 q15, q11, q13
    subs LOOP_COUNT, LOOP_COUNT, #1
    bne 1b
    pop {r4, r5}

    /* Epilogue: stage 2 for the final batch of 16 coefficients */
    veor.u16 q14, q14, q2 /* restore sign */
    veor.u16 q15, q15, q3
    vsub.u16 q14, q14, q2
    vsub.u16 q15, q15, q3
    vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx lr /* return */

    .unreq COEF_BLOCK
    .unreq DIVISORS
    .unreq WORKSPACE
    .unreq RECIPROCAL
    .unreq CORRECTION
    .unreq SHIFT
    .unreq LOOP_COUNT
.endfunc