/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014 Siarhei Siamashka. All Rights Reserved.
 * Copyright (C) 2014 Linaro Limited. All Rights Reserved.
 * Copyright (C) 2015 D. R. Commander. All Rights Reserved.
 * Copyright (C) 2015-2016 Matthieu Darbois. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
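
/*
 * How the two vtrn passes compose into a transpose (illustration): for
 * input rows {a, b, c, d},
 *
 *   vtrn.16 a, b:  a = [a0 b0 a2 b2]   b = [a1 b1 a3 b3]
 *   vtrn.16 c, d:  c = [c0 d0 c2 d2]   d = [c1 d1 c3 d3]
 *   vtrn.32 a, c:  a = [a0 b0 c0 d0]   c = [a2 b2 c2 d2]
 *   vtrn.32 b, d:  b = [a1 b1 c1 d1]   d = [a3 b3 c3 d3]
 *
 * i.e. the net effect is the plain scalar transpose (a C sketch, for
 * reference only):
 *
 *   static void transpose_4x4_ref(int16_t m[4][4])
 *   {
 *     for (int i = 0; i < 4; i++)
 *       for (int j = i + 1; j < 4; j++) {
 *         int16_t t = m[i][j];  m[i][j] = m[j][i];  m[j][i] = t;
 *       }
 *   }
 */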


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)

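/*
 * The constants above are the jidctint.c multipliers scaled by 2^13
 * (CONST_BITS = 13), i.e. in C (a sketch, for reference only):
 *
 *   #define CONST_BITS 13
 *   #define FIX(x) ((JLONG) ((x) * (((JLONG) 1) << CONST_BITS) + 0.5))
 *
 *   FIX(0.541196100) == 4433,  FIX(1.175875602) == 9633,  etc.
 *
 * The pre-combined differences/sums (e.g. FIX_1_175875602_MINUS_1_961570560)
 * pre-combine pairs of multipliers so that sums which jidctint.c computes
 * with a shared intermediate product can be evaluated directly with
 * multiply-accumulate instructions.
 */
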
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    JLONG q1, q2, q3, q4, q5, q6, q7; \
    JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
    \
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
    \
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((JLONG) row0 - (JLONG) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
    \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
    \
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
    \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
    \
    q1 = ((JLONG) row0 + (JLONG) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
    \
    /* pick up the results */ \
    tmp0 = q4; \
    tmp1 = q5; \
    tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3 = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

#define XFIX_0_899976223 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_2_562915447 d0[2]
#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
#define XFIX_1_175875602 d1[3]
#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
#define XFIX_1_175875602_MINUS_1_961570560 d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    ROW0L .req d16
    ROW0R .req d17
    ROW1L .req d18
    ROW1R .req d19
    ROW2L .req d20
    ROW2R .req d21
    ROW3L .req d22
    ROW3R .req d23
    ROW4L .req d24
    ROW4R .req d25
    ROW5L .req d26
    ROW5R .req d27
    ROW6L .req d28
    ROW6R .req d29
    ROW7L .req d30
    ROW7R .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr ip, jsimd_idct_islow_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add ip, ip, #16
    vmul.s16 q15, q15, q3
    vpush {d8-d15}  /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16 d4, ROW7L, ROW3L
    vadd.s16 d5, ROW5L, ROW1L
    vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d5, XFIX_1_175875602
    vmull.s16 q7, d4, XFIX_1_175875602
    /* Check for zero coefficients in the right 4x8 half */
    push {r4, r5}
    vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW4L
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr r0, r4, r5
    vmov q4, q6
    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    orr r0, r0, r4
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    orr r0, r0, r5
    vadd.s32 q1, q3, q2
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov q5, q7
    vadd.s32 q1, q1, q6
    orr r0, r0, r4
    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
    orr r0, r0, r5
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1L, q1, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr r0, r0, r4
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    orr r0, r0, r5
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16 q6, ROW6L, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    orr r0, r0, r4
    vrshrn.s32 ROW6L, q1, #11
    orr r0, r0, r5
    vadd.s32 q1, q3, q5
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW4L
    orr r0, r0, r4
    vrshrn.s32 ROW2L, q1, #11
    orr r0, r0, r5
    vrshrn.s32 ROW5L, q3, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr r0, r0, r4
    vadd.s32 q2, q5, q6
    orrs r0, r0, r5
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    orr r0, r4, r5
    vsub.s32 q3, q1, q4
    pop {r4, r5}
    vrshrn.s32 ROW7L, q2, #11
    vrshrn.s32 ROW3L, q5, #11
    vrshrn.s32 ROW0L, q6, #11
    vrshrn.s32 ROW4L, q3, #11

    beq 3f  /* Go to the special handling for the sparse right 4x8 half */

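/*
 * Scalar model of the zero-coefficient test interleaved above (a sketch,
 * for illustration only; "coef" is the int16 coefficient block):
 *
 *   uint32_t acc = 0, row0;
 *   for (int row = 1; row < 8; row++)          <-- the ldrd/orr pairs
 *     acc |= *(uint32_t *)&coef[row * 8 + 4] |
 *            *(uint32_t *)&coef[row * 8 + 6];
 *   row0 = *(uint32_t *)&coef[4] | *(uint32_t *)&coef[6];
 *
 * "orrs" sets the flags for acc, so "beq 3f" is taken when the right 4x8
 * half of rows 1-7 is all zeros; label 3 then tests row0 ("cmp r0, #0") to
 * choose between the fully sparse second pass (label 4) and broadcasting
 * row 0 into the right half before the normal second pass (label 1).
 */
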
    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vadd.s16 d10, ROW7R, ROW3R
    vadd.s16 d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d8, XFIX_1_175875602
    vtrn.16 ROW2L, ROW3L
    vmull.s16 q7, d10, XFIX_1_175875602
    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16 ROW0L, ROW1L
    vsubl.s16 q3, ROW0R, ROW4R
    vmull.s16 q2, ROW2R, XFIX_0_541196100
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16 ROW4L, ROW5L
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32 ROW1L, ROW3L
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
    vtrn.32 ROW4L, ROW6L
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vtrn.32 ROW0L, ROW2L
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1R, q1, #11
    vtrn.32 ROW5L, ROW7L
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vrshrn.s32 ROW6R, q1, #11
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0R, ROW4R
    vrshrn.s32 ROW2R, q1, #11
    vrshrn.s32 ROW5R, q3, #11
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vrshrn.s32 ROW7R, q2, #11
    vrshrn.s32 ROW3R, q5, #11
    vrshrn.s32 ROW0R, q6, #11
    vrshrn.s32 ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16 ROW6R, ROW7R
    vtrn.16 ROW2R, ROW3R
    vtrn.16 ROW0R, ROW1R
    vtrn.16 ROW4R, ROW5R
    vtrn.32 ROW1R, ROW3R
    vtrn.32 ROW4R, ROW6R
    vtrn.32 ROW0R, ROW2R
    vtrn.32 ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov q4, q6
    vmlsl.s16 q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW5R, XFIX_1_175875602
    vmlal.s16 q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16 q7, ROW7R, XFIX_1_175875602
    vmlal.s16 q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16 q3, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32 ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16

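/*
 * Note: pass 2 above shifted by 16 (vshrn); the additional rounding shift
 * by 2 below brings the total to 18 = CONST_BITS + PASS1_BITS + 3
 * (CONST_BITS = 13, PASS1_BITS = 2), i.e. the pass 2 descale from
 * jidctint.c, with saturation on top.
 */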
2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16 d16, q8, #2
    vqrshrn.s16 d17, q9, #2
    vqrshrn.s16 d18, q10, #2
    vqrshrn.s16 d19, q11, #2
    vpop {d8-d15}  /* restore NEON registers */
    vqrshrn.s16 d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16 q8, q9
    vqrshrn.s16 d21, q13, #2
    vqrshrn.s16 d22, q14, #2
    vmov.u8 q0, #(CENTERJSAMPLE)
    vqrshrn.s16 d23, q15, #2
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vtrn.16 q10, q11
    /* Store results to the output buffer */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vadd.u8 q10, q10, q0
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vtrn.8 d22, d23
    vst1.8 {d20}, [TMP1]
    vadd.u8 q11, q11, q0
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vtrn.16 ROW2L, ROW3L
    vtrn.16 ROW0L, ROW1L
    vtrn.16 ROW4L, ROW5L
    vshl.s16 ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32 ROW1L, ROW3L
    vtrn.32 ROW4L, ROW6L
    vtrn.32 ROW0L, ROW2L
    vtrn.32 ROW5L, ROW7L

    cmp r0, #0
    beq 4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16 ROW1R, ROW0R[1]
    vdup.s16 ROW2R, ROW0R[2]
    vdup.s16 ROW3R, ROW0R[3]
    vdup.s16 ROW4R, ROW0R[0]
    vdup.s16 ROW5R, ROW0R[1]
    vdup.s16 ROW6R, ROW0R[2]
    vdup.s16 ROW7R, ROW0R[3]
    vdup.s16 ROW0R, ROW0R[0]
    b 1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vshll.s16 q3, ROW0L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW0L, #13
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW5L, XFIX_1_175875602
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW7L, XFIX_1_175875602
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW6L, XFIX_0_541196100
    vshll.s16 q3, ROW4L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
    vshrn.s32 ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW4L, #13
    vshrn.s32 ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16
    b 2b  /* Go to epilogue */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

    .unreq ROW0L
    .unreq ROW0R
    .unreq ROW1L
    .unreq ROW1R
    .unreq ROW2L
    .unreq ROW2R
    .unreq ROW3L
    .unreq ROW3R
    .unreq ROW4L
    .unreq ROW4R
    .unreq ROW5L
    .unreq ROW5R
    .unreq ROW6L
    .unreq ROW6R
    .unreq ROW7L
    .unreq ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, less accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions, but in
 * the ARM NEON case some extra additions are required because the VQDMULH
 * instruction can't handle constants larger than 1. So expressions like
 * "x * 1.082392200" have to be converted to "x * 0.082392200 + x", which
 * introduces an extra addition. Overall, there are 6 extra additions per
 * 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
 */

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

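/*
 * How these constants encode the multipliers (a C sketch, for illustration
 * only): VQDMULH returns the high half of the doubled product, roughly
 * (2 * a * b) >> 16, so a Q15 constant b stands for the factor b / 2^15.
 * Only the fractional part of each multiplier is encoded; the integer part
 * is restored with explicit additions, e.g. for x * 1.414213562:
 *
 *   static int16_t vqdmulh_s16(int16_t a, int16_t b)
 *   {
 *     return (int16_t) (((int32_t) a * b) >> 15);  <-- saturation omitted
 *   }
 *
 *   static int16_t mul_1_414213562(int16_t x)
 *   {
 *     return x + vqdmulh_s16(x, 362 * 128 - 256 * 128);
 *   }
 *
 * since (362 - 256) * 128 = 13568 ~= 0.414213562 * 2^15.
 */
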
asm_function jsimd_idct_ifast_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr ip, jsimd_idct_ifast_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0}, [ip, :64]  /* load constants */
    vmul.s16 q15, q15, q3
    vpush {d8-d13}  /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16 q2, q10, q14
    vadd.s16 q14, q10, q14
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vadd.s16 q10, q10, q2
    /* Transpose */
    vtrn.16 q8, q9
    vsub.s16 q11, q12, q1
    vtrn.16 q14, q15
    vadd.s16 q12, q12, q1
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q8, q10
    vtrn.32 q13, q15
    vswp d28, d21
    vswp d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16 q2, q10, q14
    vswp d30, d23
    vadd.s16 q14, q10, q14
    vswp d24, d17
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vpop {d8-d13}  /* restore NEON registers */
    vadd.s16 q10, q10, q2
    vsub.s16 q11, q12, q1
    vadd.s16 q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8 q0, #0x80
    vqshrn.s16 d16, q8, #5
    vqshrn.s16 d17, q9, #5
    vqshrn.s16 d18, q10, #5
    vqshrn.s16 d19, q11, #5
    vqshrn.s16 d20, q12, #5
    vqshrn.s16 d21, q13, #5
    vqshrn.s16 d22, q14, #5
    vqshrn.s16 d23, q15, #5
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vadd.u8 q10, q10, q0
    vadd.u8 q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    /* Store results to the output buffer */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vst1.8 {d20}, [TMP1]
    vtrn.8 d22, d23
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering
 *       instructions, but readability would suffer somewhat.
 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16 q14, \x4, d2[2]
    vmlal.s16 q14, \x8, d0[0]
    vmlal.s16 q14, \x14, d0[1]

    vmull.s16 q13, \x16, d1[2]
    vmlal.s16 q13, \x12, d1[3]
    vmlal.s16 q13, \x10, d2[0]
    vmlal.s16 q13, \x6, d2[1]

    vmull.s16 q15, \x4, d2[2]
    vmlsl.s16 q15, \x8, d0[0]
    vmlsl.s16 q15, \x14, d0[1]

    vmull.s16 q12, \x16, d0[2]
    vmlal.s16 q12, \x12, d0[3]
    vmlal.s16 q12, \x10, d1[0]
    vmlal.s16 q12, \x6, d1[1]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

    .if \shift > 16
        vrshr.s32 q10, q10, #\shift
        vrshr.s32 q14, q14, #\shift
        vmovn.s32 \y26, q10
        vmovn.s32 \y29, q14
    .else
        vrshrn.s32 \y26, q10, #\shift
        vrshrn.s32 \y29, q14, #\shift
    .endif

    vadd.s32 q10, q15, q12
    vsub.s32 q15, q15, q12

    .if \shift > 16
        vrshr.s32 q10, q10, #\shift
        vrshr.s32 q15, q15, #\shift
        vmovn.s32 \y27, q10
        vmovn.s32 \y28, q15
    .else
        vrshrn.s32 \y27, q10, #\shift
        vrshrn.s32 \y28, q15, #\shift
    .endif
.endm
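
/*
 * The .if branches above exist because vrshrn can shift a 32-bit lane right
 * by at most 16 while narrowing; pass 2 descales by 19, so in that case the
 * rounding shift (vrshr) and the narrowing (vmovn) must be issued as two
 * separate instructions.
 */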

asm_function jsimd_idct_4x4_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    vpush {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q3, q3, q10
    vmul.s16 q4, q4, q11
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    vmul.s16 q6, q6, q13
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q7, q7, q14
    vmul.s16 q8, q8, q15

    /* Pass 1 */
    idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4 d5, d7, d9, d11

    /* Pass 2 */
    idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4 d26, d27, d28, d29

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vadd.s16 q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
    vst1.32 {d26[0]}, [TMP1]!
    vst1.32 {d27[0]}, [TMP3]!
    vst1.32 {d26[1]}, [TMP2]!
    vst1.32 {d27[1]}, [TMP4]!
#else
    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[0]}, [TMP3]!
    vst1.8 {d26[1]}, [TMP1]!
    vst1.8 {d27[1]}, [TMP3]!
    vst1.8 {d26[2]}, [TMP1]!
    vst1.8 {d27[2]}, [TMP3]!
    vst1.8 {d26[3]}, [TMP1]!
    vst1.8 {d27[3]}, [TMP3]!

    vst1.8 {d26[4]}, [TMP2]!
    vst1.8 {d27[4]}, [TMP4]!
    vst1.8 {d26[5]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP4]!
    vst1.8 {d26[6]}, [TMP2]!
    vst1.8 {d27[6]}, [TMP4]!
    vst1.8 {d26[7]}, [TMP2]!
    vst1.8 {d27[7]}, [TMP4]!
#endif

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16 q14, \x4, #15
    vmull.s16 q13, \x6, d0[3]
    vmlal.s16 q13, \x10, d0[2]
    vmlal.s16 q13, \x12, d0[1]
    vmlal.s16 q13, \x16, d0[0]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

    .if \shift > 16
        vrshr.s32 q10, q10, #\shift
        vrshr.s32 q14, q14, #\shift
        vmovn.s32 \y26, q10
        vmovn.s32 \y27, q14
    .else
        vrshrn.s32 \y26, q10, #\shift
        vrshrn.s32 \y27, q14, #\shift
    .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    vld1.16 {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | -      | -
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | -      | -
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vmul.s16 q3, q3, q10
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16 q6, q6, q13
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4 d5, d7, d9, d11
#else
    vmull.s16 q13, d6, d0[3]
    vmlal.s16 q13, d10, d0[2]
    vmlal.s16 q13, d12, d0[1]
    vmlal.s16 q13, d16, d0[0]
    vmull.s16 q12, d7, d0[3]
    vmlal.s16 q12, d11, d0[2]
    vmlal.s16 q12, d13, d0[1]
    vmlal.s16 q12, d17, d0[0]
    vshll.s16 q14, d4, #15
    vshll.s16 q15, d5, #15
    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13
    vrshrn.s32 d4, q10, #13
    vrshrn.s32 d6, q14, #13
    vadd.s32 q10, q15, q12
    vsub.s32 q14, q15, q12
    vrshrn.s32 d5, q10, #13
    vrshrn.s32 d7, q14, #13
    vtrn.16 q2, q3
    vtrn.32 q3, q5
#endif

    /* Pass 2 */
    idct_helper d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q13

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[4]}, [TMP1]!
    vst1.8 {d26[1]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP2]!

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */

.macro do_load size
    .if \size == 8
        vld1.8 {d4}, [U, :64]!
        vld1.8 {d5}, [V, :64]!
        vld1.8 {d0}, [Y, :64]!
        pld [U, #64]
        pld [V, #64]
        pld [Y, #64]
    .elseif \size == 4
        vld1.8 {d4[0]}, [U]!
        vld1.8 {d4[1]}, [U]!
        vld1.8 {d4[2]}, [U]!
        vld1.8 {d4[3]}, [U]!
        vld1.8 {d5[0]}, [V]!
        vld1.8 {d5[1]}, [V]!
        vld1.8 {d5[2]}, [V]!
        vld1.8 {d5[3]}, [V]!
        vld1.8 {d0[0]}, [Y]!
        vld1.8 {d0[1]}, [Y]!
        vld1.8 {d0[2]}, [Y]!
        vld1.8 {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8 {d4[4]}, [U]!
        vld1.8 {d4[5]}, [U]!
        vld1.8 {d5[4]}, [V]!
        vld1.8 {d5[5]}, [V]!
        vld1.8 {d0[4]}, [Y]!
        vld1.8 {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8 {d4[6]}, [U]!
        vld1.8 {d5[6]}, [V]!
        vld1.8 {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8 {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8 {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 16
        .if \size == 8
            vst1.16 {q15}, [RGB]!
        .elseif \size == 4
            vst1.16 {d30}, [RGB]!
        .elseif \size == 2
            vst1.16 {d31[0]}, [RGB]!
            vst1.16 {d31[1]}, [RGB]!
        .elseif \size == 1
            vst1.16 {d31[2]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8 q3, q1, d4       /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5       /* q4 = v - 128 */
    vmull.s16 q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q11, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    .if \bpp != 16
        vqmovun.s16 d1\g_offs, q11
        vqmovun.s16 d1\r_offs, q12
        vqmovun.s16 d1\b_offs, q14
    .else  /* rgb565 */
        vqshlu.s16 q13, q11, #8
        vqshlu.s16 q15, q12, #8
        vqshlu.s16 q14, q14, #8
        vsri.u16 q15, q13, #5
        vsri.u16 q15, q14, #11
    .endif
.endm
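
/*
 * Scalar model of the two stages above (a sketch, for illustration only;
 * vrshrn rounding is written out, the final saturating narrow is omitted).
 * The fixed-point constants live in d1, the Y samples in d0, and q1 holds
 * the -128 centering constant:
 *
 *   int cb = u - 128, cr = v - 128;
 *   int r = y + ((22971 * cr + (1 << 13)) >> 14);                // 1.40200
 *   int g = y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15);  // -0.34414,
 *                                                                // -0.71414
 *   int b = y + ((29033 * cb + (1 << 13)) >> 14);                // 1.77200
 *
 * matching the jdcolor.c equations R = Y + 1.40200 * Cr,
 * G = Y - 0.34414 * Cb - 0.71414 * Cr, B = Y + 1.77200 * Cb.
 */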

.macro do_yuv_to_rgb_stage2_store_load_stage1
    /* "do_yuv_to_rgb_stage2" and "store" */
    vrshrn.s32 d20, q10, #15
    /* "load" and "do_yuv_to_rgb_stage1" */
    pld [U, #64]
    vrshrn.s32 d21, q11, #15
    pld [V, #64]
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vld1.8 {d4}, [U, :64]!
    vrshrn.s32 d28, q14, #14
    vld1.8 {d5}, [V, :64]!
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q3, q1, d4       /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5       /* q4 = v - 128 */
    vaddw.u8 q11, q10, d0
    vmull.s16 q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2]  /* multiply by -23401 */
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    .if \bpp != 16  /**************** rgb24/rgb32 ****************************/
        vqmovun.s16 d1\g_offs, q11
        pld [Y, #64]
        vqmovun.s16 d1\r_offs, q12
        vld1.8 {d0}, [Y, :64]!
        vqmovun.s16 d1\b_offs, q14
        vmull.s16 q11, d7, d1[1]  /* multiply by -11277 */
        vmlal.s16 q11, d9, d1[2]  /* multiply by -23401 */
        do_store \bpp, 8
        vmull.s16 q12, d8, d1[0]  /* multiply by 22971 */
        vmull.s16 q13, d9, d1[0]  /* multiply by 22971 */
        vmull.s16 q14, d6, d1[3]  /* multiply by 29033 */
        vmull.s16 q15, d7, d1[3]  /* multiply by 29033 */
    .else  /**************************** rgb565 ******************************/
        vqshlu.s16 q13, q11, #8
        pld [Y, #64]
        vqshlu.s16 q15, q12, #8
        vqshlu.s16 q14, q14, #8
        vld1.8 {d0}, [Y, :64]!
        vmull.s16 q11, d7, d1[1]
        vmlal.s16 q11, d9, d1[2]
        vsri.u16 q15, q13, #5
        vmull.s16 q12, d8, d1[0]
        vsri.u16 q15, q14, #11
        vmull.s16 q13, d9, d1[0]
        vmull.s16 q14, d6, d1[3]
        do_store \bpp, 8
        vmull.s16 q15, d7, d1[3]
    .endif
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH .req r0
    INPUT_BUF .req r1
    INPUT_ROW .req r2
    OUTPUT_BUF .req r3
    NUM_ROWS .req r4

    INPUT_BUF0 .req r5
    INPUT_BUF1 .req r6
    INPUT_BUF2 .req INPUT_BUF

    RGB .req r7
    Y .req r8
    U .req r9
    V .req r10
    N .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #4]
    ldr INPUT_BUF2, [INPUT_BUF, #8]
    .unreq INPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8 q5, #255
    vmov.u8 q6, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    blt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
DRC321e0682011-05-03 08:47:43 +00001604
1605.purgem do_load
1606.purgem do_store
1607
DRC3e00f032014-02-05 07:40:00 +00001608
DRC321e0682011-05-03 08:47:43 +00001609/*****************************************************************************/
DRCb7400542011-08-10 23:31:13 +00001610
1611/*
DRC7a9376c2011-08-12 19:27:20 +00001612 * jsimd_extrgb_ycc_convert_neon
1613 * jsimd_extbgr_ycc_convert_neon
1614 * jsimd_extrgbx_ycc_convert_neon
1615 * jsimd_extbgrx_ycc_convert_neon
1616 * jsimd_extxbgr_ycc_convert_neon
1617 * jsimd_extxrgb_ycc_convert_neon
1618 *
1619 * Colorspace conversion RGB -> YCbCr
1620 */
1621
1622.macro do_store size
DRCcf888482016-02-02 23:17:06 -06001623 .if \size == 8
1624 vst1.8 {d20}, [Y]!
1625 vst1.8 {d21}, [U]!
1626 vst1.8 {d22}, [V]!
1627 .elseif \size == 4
1628 vst1.8 {d20[0]}, [Y]!
1629 vst1.8 {d20[1]}, [Y]!
1630 vst1.8 {d20[2]}, [Y]!
1631 vst1.8 {d20[3]}, [Y]!
1632 vst1.8 {d21[0]}, [U]!
1633 vst1.8 {d21[1]}, [U]!
1634 vst1.8 {d21[2]}, [U]!
1635 vst1.8 {d21[3]}, [U]!
1636 vst1.8 {d22[0]}, [V]!
1637 vst1.8 {d22[1]}, [V]!
1638 vst1.8 {d22[2]}, [V]!
1639 vst1.8 {d22[3]}, [V]!
1640 .elseif \size == 2
1641 vst1.8 {d20[4]}, [Y]!
1642 vst1.8 {d20[5]}, [Y]!
1643 vst1.8 {d21[4]}, [U]!
1644 vst1.8 {d21[5]}, [U]!
1645 vst1.8 {d22[4]}, [V]!
1646 vst1.8 {d22[5]}, [V]!
1647 .elseif \size == 1
1648 vst1.8 {d20[6]}, [Y]!
1649 vst1.8 {d21[6]}, [U]!
1650 vst1.8 {d22[6]}, [V]!
1651 .else
1652 .error unsupported macroblock size
1653 .endif
DRC7a9376c2011-08-12 19:27:20 +00001654.endm
1655
1656.macro do_load bpp, size
DRCcf888482016-02-02 23:17:06 -06001657 .if \bpp == 24
1658 .if \size == 8
1659 vld3.8 {d10, d11, d12}, [RGB]!
1660 pld [RGB, #128]
1661 .elseif \size == 4
1662 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1663 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1664 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1665 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1666 .elseif \size == 2
1667 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1668 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1669 .elseif \size == 1
1670 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
DRC7a9376c2011-08-12 19:27:20 +00001671 .else
DRCcf888482016-02-02 23:17:06 -06001672 .error unsupported macroblock size
DRC7a9376c2011-08-12 19:27:20 +00001673 .endif
DRCcf888482016-02-02 23:17:06 -06001674 .elseif \bpp == 32
1675 .if \size == 8
1676 vld4.8 {d10, d11, d12, d13}, [RGB]!
1677 pld [RGB, #128]
1678 .elseif \size == 4
1679 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1680 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1681 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1682 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1683 .elseif \size == 2
1684 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1685 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1686 .elseif \size == 1
1687 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1688 .else
1689 .error unsupported macroblock size
1690 .endif
1691 .else
1692 .error unsupported bpp
1693 .endif
DRC7a9376c2011-08-12 19:27:20 +00001694.endm
1695
1696.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1697
1698/*
DRCcf888482016-02-02 23:17:06 -06001699 * 2-stage pipelined RGB->YCbCr conversion
DRC7a9376c2011-08-12 19:27:20 +00001700 */
1701
1702.macro do_rgb_to_yuv_stage1
DRCcf888482016-02-02 23:17:06 -06001703 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1704 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1705 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1706 vmull.u16 q7, d4, d0[0]
1707 vmlal.u16 q7, d6, d0[1]
1708 vmlal.u16 q7, d8, d0[2]
1709 vmull.u16 q8, d5, d0[0]
1710 vmlal.u16 q8, d7, d0[1]
1711 vmlal.u16 q8, d9, d0[2]
1712 vrev64.32 q9, q1
1713 vrev64.32 q13, q1
1714 vmlsl.u16 q9, d4, d0[3]
1715 vmlsl.u16 q9, d6, d1[0]
1716 vmlal.u16 q9, d8, d1[1]
1717 vmlsl.u16 q13, d5, d0[3]
1718 vmlsl.u16 q13, d7, d1[0]
1719 vmlal.u16 q13, d9, d1[1]
1720 vrev64.32 q14, q1
1721 vrev64.32 q15, q1
1722 vmlal.u16 q14, d4, d1[1]
1723 vmlsl.u16 q14, d6, d1[2]
1724 vmlsl.u16 q14, d8, d1[3]
1725 vmlal.u16 q15, d5, d1[1]
1726 vmlsl.u16 q15, d7, d1[2]
1727 vmlsl.u16 q15, d9, d1[3]
DRC7a9376c2011-08-12 19:27:20 +00001728.endm
1729
1730.macro do_rgb_to_yuv_stage2
DRCcf888482016-02-02 23:17:06 -06001731 vrshrn.u32 d20, q7, #16
1732 vrshrn.u32 d21, q8, #16
1733 vshrn.u32 d22, q9, #16
1734 vshrn.u32 d23, q13, #16
1735 vshrn.u32 d24, q14, #16
1736 vshrn.u32 d25, q15, #16
1737 vmovn.u16 d20, q10 /* d20 = y */
1738 vmovn.u16 d21, q11 /* d21 = u */
1739 vmovn.u16 d22, q12 /* d22 = v */
DRC7a9376c2011-08-12 19:27:20 +00001740.endm
1741
1742.macro do_rgb_to_yuv
1743 do_rgb_to_yuv_stage1
1744 do_rgb_to_yuv_stage2
1745.endm
1746
1747.macro do_rgb_to_yuv_stage2_store_load_stage1
DRCcf888482016-02-02 23:17:06 -06001748 vrshrn.u32 d20, q7, #16
1749 vrshrn.u32 d21, q8, #16
1750 vshrn.u32 d22, q9, #16
1751 vrev64.32 q9, q1
1752 vshrn.u32 d23, q13, #16
1753 vrev64.32 q13, q1
1754 vshrn.u32 d24, q14, #16
1755 vshrn.u32 d25, q15, #16
1756 do_load \bpp, 8
1757 vmovn.u16 d20, q10 /* d20 = y */
1758 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1759 vmovn.u16 d21, q11 /* d21 = u */
1760 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1761 vmovn.u16 d22, q12 /* d22 = v */
1762 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1763 vmull.u16 q7, d4, d0[0]
1764 vmlal.u16 q7, d6, d0[1]
1765 vmlal.u16 q7, d8, d0[2]
1766 vst1.8 {d20}, [Y]!
1767 vmull.u16 q8, d5, d0[0]
1768 vmlal.u16 q8, d7, d0[1]
1769 vmlal.u16 q8, d9, d0[2]
1770 vmlsl.u16 q9, d4, d0[3]
1771 vmlsl.u16 q9, d6, d1[0]
1772 vmlal.u16 q9, d8, d1[1]
1773 vst1.8 {d21}, [U]!
1774 vmlsl.u16 q13, d5, d0[3]
1775 vmlsl.u16 q13, d7, d1[0]
1776 vmlal.u16 q13, d9, d1[1]
1777 vrev64.32 q14, q1
1778 vrev64.32 q15, q1
1779 vmlal.u16 q14, d4, d1[1]
1780 vmlsl.u16 q14, d6, d1[2]
1781 vmlsl.u16 q14, d8, d1[3]
1782 vst1.8 {d22}, [V]!
1783 vmlal.u16 q15, d5, d1[1]
1784 vmlsl.u16 q15, d7, d1[2]
1785 vmlsl.u16 q15, d9, d1[3]
DRC7a9376c2011-08-12 19:27:20 +00001786.endm
1787
1788.balign 16
1789jsimd_\colorid\()_ycc_neon_consts:
DRCcf888482016-02-02 23:17:06 -06001790 .short 19595, 38470, 7471, 11059
1791 .short 21709, 32768, 27439, 5329
1792 .short 32767, 128, 32767, 128
1793 .short 32767, 128, 32767, 128
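/*
 * Scalar sketch of the per-pixel arithmetic implied by the constants above
 * (reconstructed from the multiplies and shifts in the stage macros;
 * 19595/65536 ~= 0.29900, etc., and (128 << 16) + 32767 biases Cb/Cr by
 * 128 with near-0.5 rounding ahead of the truncating vshrn):
 *
 *   y  = ( 19595 * r + 38470 * g +  7471 * b + 32768) >> 16
 *   cb = (-11059 * r - 21709 * g + 32768 * b + (128 << 16) + 32767) >> 16
 *   cr = ( 32768 * r - 27439 * g -  5329 * b + (128 << 16) + 32767) >> 16
 */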
DRC7a9376c2011-08-12 19:27:20 +00001794
1795asm_function jsimd_\colorid\()_ycc_convert_neon
1796 OUTPUT_WIDTH .req r0
1797 INPUT_BUF .req r1
1798 OUTPUT_BUF .req r2
1799 OUTPUT_ROW .req r3
1800 NUM_ROWS .req r4
1801
1802 OUTPUT_BUF0 .req r5
1803 OUTPUT_BUF1 .req r6
1804 OUTPUT_BUF2 .req OUTPUT_BUF
1805
1806 RGB .req r7
1807 Y .req r8
1808 U .req r9
1809 V .req r10
1810 N .req ip
1811
1812 /* Load constants to d0, d1, d2, d3 */
1813 adr ip, jsimd_\colorid\()_ycc_neon_consts
1814 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1815
1816 /* Save ARM registers and handle input arguments */
1817 push {r4, r5, r6, r7, r8, r9, r10, lr}
1818 ldr NUM_ROWS, [sp, #(4 * 8)]
1819 ldr OUTPUT_BUF0, [OUTPUT_BUF]
1820 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
1821 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
1822 .unreq OUTPUT_BUF
1823
1824 /* Save NEON registers */
1825 vpush {d8-d15}
1826
1827 /* Outer loop over scanlines */
1828 cmp NUM_ROWS, #1
1829 blt 9f
18300:
1831 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1832 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1833 mov N, OUTPUT_WIDTH
1834 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1835 add OUTPUT_ROW, OUTPUT_ROW, #1
1836 ldr RGB, [INPUT_BUF], #4
1837
1838 /* Inner loop over pixels */
1839 subs N, N, #8
1840 blt 3f
1841 do_load \bpp, 8
1842 do_rgb_to_yuv_stage1
1843 subs N, N, #8
1844 blt 2f
18451:
1846 do_rgb_to_yuv_stage2_store_load_stage1
1847 subs N, N, #8
1848 bge 1b
18492:
1850 do_rgb_to_yuv_stage2
1851 do_store 8
1852 tst N, #7
1853 beq 8f
18543:
1855 tst N, #4
1856 beq 3f
1857 do_load \bpp, 4
18583:
1859 tst N, #2
1860 beq 4f
1861 do_load \bpp, 2
18624:
1863 tst N, #1
1864 beq 5f
1865 do_load \bpp, 1
18665:
1867 do_rgb_to_yuv
1868 tst N, #4
1869 beq 6f
1870 do_store 4
18716:
1872 tst N, #2
1873 beq 7f
1874 do_store 2
18757:
1876 tst N, #1
1877 beq 8f
1878 do_store 1
18798:
1880 subs NUM_ROWS, NUM_ROWS, #1
1881 bgt 0b
18829:
1883 /* Restore all registers and return */
1884 vpop {d8-d15}
1885 pop {r4, r5, r6, r7, r8, r9, r10, pc}
1886
1887 .unreq OUTPUT_WIDTH
1888 .unreq OUTPUT_ROW
1889 .unreq INPUT_BUF
1890 .unreq NUM_ROWS
1891 .unreq OUTPUT_BUF0
1892 .unreq OUTPUT_BUF1
1893 .unreq OUTPUT_BUF2
1894 .unreq RGB
1895 .unreq Y
1896 .unreq U
1897 .unreq V
1898 .unreq N
DRC7a9376c2011-08-12 19:27:20 +00001899
1900.purgem do_rgb_to_yuv
1901.purgem do_rgb_to_yuv_stage1
1902.purgem do_rgb_to_yuv_stage2
1903.purgem do_rgb_to_yuv_stage2_store_load_stage1
1904
1905.endm
1906
1907/*--------------------------------- id ----- bpp R G B */
1908generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
1909generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
1910generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1911generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1912generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1913generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1914
1915.purgem do_load
1916.purgem do_store
1917
DRC3e00f032014-02-05 07:40:00 +00001918
DRC7a9376c2011-08-12 19:27:20 +00001919/*****************************************************************************/
1920
1921/*
DRCb7400542011-08-10 23:31:13 +00001922 * Load data into workspace, applying unsigned->signed conversion
1923 *
1924 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1925 * rid of VST1.16 instructions
1926 */
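/*
 * Scalar equivalent of the block below (a sketch): for each of the 64
 * samples,
 *
 *   workspace[i] = (DCTELEM) sample_data[row][start_col + col] - CENTERJSAMPLE;
 *
 * i.e. the vsubl.u8 subtractions against d0 = #128.
 */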
1927
1928asm_function jsimd_convsamp_neon
1929 SAMPLE_DATA .req r0
1930 START_COL .req r1
1931 WORKSPACE .req r2
1932 TMP1 .req r3
1933 TMP2 .req r4
1934 TMP3 .req r5
1935 TMP4 .req ip
1936
1937 push {r4, r5}
1938 vmov.u8 d0, #128
1939
1940 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1941 add TMP1, TMP1, START_COL
1942 add TMP2, TMP2, START_COL
1943 add TMP3, TMP3, START_COL
1944 add TMP4, TMP4, START_COL
1945 vld1.8 {d16}, [TMP1]
1946 vsubl.u8 q8, d16, d0
1947 vld1.8 {d18}, [TMP2]
1948 vsubl.u8 q9, d18, d0
1949 vld1.8 {d20}, [TMP3]
1950 vsubl.u8 q10, d20, d0
1951 vld1.8 {d22}, [TMP4]
1952 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1953 vsubl.u8 q11, d22, d0
1954 vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
1955 add TMP1, TMP1, START_COL
1956 add TMP2, TMP2, START_COL
1957 vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
1958 add TMP3, TMP3, START_COL
1959 add TMP4, TMP4, START_COL
1960 vld1.8 {d24}, [TMP1]
1961 vsubl.u8 q12, d24, d0
1962 vld1.8 {d26}, [TMP2]
1963 vsubl.u8 q13, d26, d0
1964 vld1.8 {d28}, [TMP3]
1965 vsubl.u8 q14, d28, d0
1966 vld1.8 {d30}, [TMP4]
1967 vsubl.u8 q15, d30, d0
1968 vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
1969 vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
1970 pop {r4, r5}
1971 bx lr
1972
1973 .unreq SAMPLE_DATA
1974 .unreq START_COL
1975 .unreq WORKSPACE
1976 .unreq TMP1
1977 .unreq TMP2
1978 .unreq TMP3
1979 .unreq TMP4
DRCb7400542011-08-10 23:31:13 +00001980
DRC3e00f032014-02-05 07:40:00 +00001981
DRCb7400542011-08-10 23:31:13 +00001982/*****************************************************************************/
1983
1984/*
1985 * jsimd_fdct_ifast_neon
1986 *
1987 * This function contains a fast, though less accurate, integer
1988 * implementation of the forward DCT (Discrete Cosine Transform). It uses
1989 * the same calculations and produces exactly the same output as IJG's
1990 * original 'jpeg_fdct_ifast' function from jfdctfst.c.
1991 *
1992 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1993 * rid of a bunch of VLD1.16 instructions
1994 */
1995
1996#define XFIX_0_382683433 d0[0]
1997#define XFIX_0_541196100 d0[1]
1998#define XFIX_0_707106781 d0[2]
1999#define XFIX_1_306562965 d0[3]
2000
2001.balign 16
2002jsimd_fdct_ifast_neon_consts:
DRCcf888482016-02-02 23:17:06 -06002003 .short (98 * 128) /* XFIX_0_382683433 */
2004 .short (139 * 128) /* XFIX_0_541196100 */
2005 .short (181 * 128) /* XFIX_0_707106781 */
2006 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
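/*
 * The constants are stored as round(value * 256) * 128 so that vqdmulh.s16,
 * which returns (a * b * 2) >> 16, computes approximately a * value:
 * e.g. (a * 98 * 128 * 2) >> 16 == (a * 98) >> 8 ~= a * 0.3828.
 * XFIX_1_306562965 keeps only the excess over 1.0 (334/256 - 1); the code
 * below adds the unscaled term back in separately.
 */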
DRCb7400542011-08-10 23:31:13 +00002007
2008asm_function jsimd_fdct_ifast_neon
2009
2010 DATA .req r0
2011 TMP .req ip
2012
2013 vpush {d8-d15}
2014
2015 /* Load constants */
2016 adr TMP, jsimd_fdct_ifast_neon_consts
2017 vld1.16 {d0}, [TMP, :64]
2018
2019 /* Load all DATA into NEON registers with the following allocation:
2020 * 0 1 2 3 | 4 5 6 7
2021 * ---------+--------
2022 * 0 | d16 | d17 | q8
2023 * 1 | d18 | d19 | q9
2024 * 2 | d20 | d21 | q10
2025 * 3 | d22 | d23 | q11
2026 * 4 | d24 | d25 | q12
2027 * 5 | d26 | d27 | q13
2028 * 6 | d28 | d29 | q14
2029 * 7 | d30 | d31 | q15
2030 */
2031
2032 vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
2033 vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
2034 vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
2035 vld1.16 {d28, d29, d30, d31}, [DATA, :128]
2036 sub DATA, DATA, #(128 - 32)
2037
2038 mov TMP, #2
20391:
2040 /* Transpose */
2041 vtrn.16 q12, q13
2042 vtrn.16 q10, q11
DRCcf888482016-02-02 23:17:06 -06002043 vtrn.16 q8, q9
DRCb7400542011-08-10 23:31:13 +00002044 vtrn.16 q14, q15
DRCcf888482016-02-02 23:17:06 -06002045 vtrn.32 q9, q11
DRCb7400542011-08-10 23:31:13 +00002046 vtrn.32 q13, q15
DRCcf888482016-02-02 23:17:06 -06002047 vtrn.32 q8, q10
DRCb7400542011-08-10 23:31:13 +00002048 vtrn.32 q12, q14
2049 vswp d30, d23
2050 vswp d24, d17
2051 vswp d26, d19
2052 /* 1-D FDCT */
DRCcf888482016-02-02 23:17:06 -06002053 vadd.s16 q2, q11, q12
DRCb7400542011-08-10 23:31:13 +00002054 vswp d28, d21
2055 vsub.s16 q12, q11, q12
DRCcf888482016-02-02 23:17:06 -06002056 vsub.s16 q6, q10, q13
DRCb7400542011-08-10 23:31:13 +00002057 vadd.s16 q10, q10, q13
DRCcf888482016-02-02 23:17:06 -06002058 vsub.s16 q7, q9, q14
2059 vadd.s16 q9, q9, q14
2060 vsub.s16 q1, q8, q15
2061 vadd.s16 q8, q8, q15
2062 vsub.s16 q4, q9, q10
2063 vsub.s16 q5, q8, q2
2064 vadd.s16 q3, q9, q10
2065 vadd.s16 q4, q4, q5
2066 vadd.s16 q2, q8, q2
2067 vqdmulh.s16 q4, q4, XFIX_0_707106781
DRCb7400542011-08-10 23:31:13 +00002068 vadd.s16 q11, q12, q6
DRCcf888482016-02-02 23:17:06 -06002069 vadd.s16 q8, q2, q3
2070 vsub.s16 q12, q2, q3
2071 vadd.s16 q3, q6, q7
2072 vadd.s16 q7, q7, q1
2073 vqdmulh.s16 q3, q3, XFIX_0_707106781
2074 vsub.s16 q6, q11, q7
2075 vadd.s16 q10, q5, q4
2076 vqdmulh.s16 q6, q6, XFIX_0_382683433
2077 vsub.s16 q14, q5, q4
DRCb7400542011-08-10 23:31:13 +00002078 vqdmulh.s16 q11, q11, XFIX_0_541196100
DRCcf888482016-02-02 23:17:06 -06002079 vqdmulh.s16 q5, q7, XFIX_1_306562965
2080 vadd.s16 q4, q1, q3
2081 vsub.s16 q3, q1, q3
2082 vadd.s16 q7, q7, q6
DRCb7400542011-08-10 23:31:13 +00002083 vadd.s16 q11, q11, q6
DRCcf888482016-02-02 23:17:06 -06002084 vadd.s16 q7, q7, q5
2085 vadd.s16 q13, q3, q11
2086 vsub.s16 q11, q3, q11
2087 vadd.s16 q9, q4, q7
2088 vsub.s16 q15, q4, q7
DRCb7400542011-08-10 23:31:13 +00002089 subs TMP, TMP, #1
2090 bne 1b
2091
2092 /* store results */
2093 vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
2094 vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
2095 vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
2096 vst1.16 {d28, d29, d30, d31}, [DATA, :128]
2097
2098 vpop {d8-d15}
2099 bx lr
2100
2101 .unreq DATA
2102 .unreq TMP
DRCb7400542011-08-10 23:31:13 +00002103
DRC3e00f032014-02-05 07:40:00 +00002104
DRCb7400542011-08-10 23:31:13 +00002105/*****************************************************************************/
DRC82bd5212011-08-17 21:00:59 +00002106
2107/*
2108 * GLOBAL(void)
DRCbd498032016-02-19 08:53:33 -06002109 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2110 * DCTELEM *workspace);
DRC82bd5212011-08-17 21:00:59 +00002111 *
2112 * Note: the code uses 2-stage pipelining in order to improve instruction
2113 * scheduling and eliminate stalls (this provides ~15% better
2114 * performance for this function on both ARM Cortex-A8 and
2115 * ARM Cortex-A9 when compared to the non-pipelined variant).
2116 * The instructions which belong to the second stage use different
2117 * indentation for better readability.
2118 */
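/*
 * Rough scalar equivalent of one element (a sketch inferred from the vector
 * code; the layout of the divisors block is assumed to be reciprocal[0..63],
 * correction[64..127], scale[128..191] (unused here) and shift[192..255]):
 *
 *   temp = abs(workspace[i]) + correction[i];
 *   temp = (DCTELEM) (((unsigned) temp * reciprocal[i]) >> 16) >> shift[i];
 *   coef_block[i] = (workspace[i] < 0) ? -temp : temp;
 */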
2119asm_function jsimd_quantize_neon
2120
2121 COEF_BLOCK .req r0
2122 DIVISORS .req r1
2123 WORKSPACE .req r2
2124
2125 RECIPROCAL .req DIVISORS
2126 CORRECTION .req r3
2127 SHIFT .req ip
2128 LOOP_COUNT .req r4
2129
2130 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2131 vabs.s16 q12, q0
2132 add CORRECTION, DIVISORS, #(64 * 2)
2133 add SHIFT, DIVISORS, #(64 * 6)
2134 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2135 vabs.s16 q13, q1
2136 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
DRCcf888482016-02-02 23:17:06 -06002137 vadd.u16 q12, q12, q10 /* add correction */
DRC82bd5212011-08-17 21:00:59 +00002138 vadd.u16 q13, q13, q11
DRCcf888482016-02-02 23:17:06 -06002139 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
DRC82bd5212011-08-17 21:00:59 +00002140 vmull.u16 q11, d25, d17
DRCcf888482016-02-02 23:17:06 -06002141 vmull.u16 q8, d26, d18
2142 vmull.u16 q9, d27, d19
DRC82bd5212011-08-17 21:00:59 +00002143 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2144 vshrn.u32 d20, q10, #16
2145 vshrn.u32 d21, q11, #16
DRCcf888482016-02-02 23:17:06 -06002146 vshrn.u32 d22, q8, #16
2147 vshrn.u32 d23, q9, #16
DRC82bd5212011-08-17 21:00:59 +00002148 vneg.s16 q12, q12
2149 vneg.s16 q13, q13
DRCcf888482016-02-02 23:17:06 -06002150 vshr.s16 q2, q0, #15 /* extract sign */
2151 vshr.s16 q3, q1, #15
2152 vshl.u16 q14, q10, q12 /* shift */
DRC82bd5212011-08-17 21:00:59 +00002153 vshl.u16 q15, q11, q13
2154
2155 push {r4, r5}
2156 mov LOOP_COUNT, #3
21571:
2158 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2159 veor.u16 q14, q14, q2 /* restore sign */
2160 vabs.s16 q12, q0
2161 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2162 vabs.s16 q13, q1
2163 veor.u16 q15, q15, q3
2164 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
DRCcf888482016-02-02 23:17:06 -06002165 vadd.u16 q12, q12, q10 /* add correction */
DRC82bd5212011-08-17 21:00:59 +00002166 vadd.u16 q13, q13, q11
DRCcf888482016-02-02 23:17:06 -06002167 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
DRC82bd5212011-08-17 21:00:59 +00002168 vmull.u16 q11, d25, d17
DRCcf888482016-02-02 23:17:06 -06002169 vmull.u16 q8, d26, d18
2170 vmull.u16 q9, d27, d19
DRC82bd5212011-08-17 21:00:59 +00002171 vsub.u16 q14, q14, q2
2172 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2173 vsub.u16 q15, q15, q3
2174 vshrn.u32 d20, q10, #16
2175 vshrn.u32 d21, q11, #16
2176 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
DRCcf888482016-02-02 23:17:06 -06002177 vshrn.u32 d22, q8, #16
2178 vshrn.u32 d23, q9, #16
DRC82bd5212011-08-17 21:00:59 +00002179 vneg.s16 q12, q12
2180 vneg.s16 q13, q13
DRCcf888482016-02-02 23:17:06 -06002181 vshr.s16 q2, q0, #15 /* extract sign */
2182 vshr.s16 q3, q1, #15
2183 vshl.u16 q14, q10, q12 /* shift */
DRC82bd5212011-08-17 21:00:59 +00002184 vshl.u16 q15, q11, q13
2185 subs LOOP_COUNT, LOOP_COUNT, #1
2186 bne 1b
2187 pop {r4, r5}
2188
2189 veor.u16 q14, q14, q2 /* restore sign */
2190 veor.u16 q15, q15, q3
2191 vsub.u16 q14, q14, q2
2192 vsub.u16 q15, q15, q3
2193 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2194
DRCcf888482016-02-02 23:17:06 -06002195 bx lr /* return */
DRC82bd5212011-08-17 21:00:59 +00002196
2197 .unreq COEF_BLOCK
2198 .unreq DIVISORS
2199 .unreq WORKSPACE
2200 .unreq RECIPROCAL
2201 .unreq CORRECTION
2202 .unreq SHIFT
2203 .unreq LOOP_COUNT
DRC316617f2012-06-13 05:17:03 +00002204
DRC3e00f032014-02-05 07:40:00 +00002205
DRC316617f2012-06-13 05:17:03 +00002206/*****************************************************************************/
2207
2208/*
2209 * GLOBAL(void)
DRCcf888482016-02-02 23:17:06 -06002210 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
2211 * JDIMENSION downsampled_width,
2212 * JSAMPARRAY input_data,
DRCbd498032016-02-19 08:53:33 -06002213 * JSAMPARRAY *output_data_ptr);
DRC316617f2012-06-13 05:17:03 +00002214 *
2215 * Note: the use of unaligned writes is the main remaining bottleneck in
2216 * this code; eliminating it could potentially yield a performance
2217 * improvement of some tens of percent on Cortex-A8/Cortex-A9.
2218 */
2219
2220/*
2221 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2222 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2223 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2224 * Register d28 is used for multiplication by 3. Register q15 is used
2225 * for adding +1 bias.
2226 */
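/*
 * Per input pixel this corresponds to the scalar fancy-upsampling rule
 * (cf. h2v1_fancy_upsample() in jdsample.c; the +1/+2 biases can be read
 * off the vshrn/vrshrn pair below):
 *
 *   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;
 *   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;
 */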
DRCcf888482016-02-02 23:17:06 -06002227.macro upsample16 OUTPTR, INPTR
DRC316617f2012-06-13 05:17:03 +00002228 vld1.8 {q0}, [\INPTR]!
DRCcf888482016-02-02 23:17:06 -06002229 vmovl.u8 q8, d0
2230 vext.8 q2, q1, q0, #15
2231 vmovl.u8 q9, d1
DRC316617f2012-06-13 05:17:03 +00002232 vaddw.u8 q10, q15, d4
2233 vaddw.u8 q11, q15, d5
DRCcf888482016-02-02 23:17:06 -06002234 vmlal.u8 q8, d4, d28
2235 vmlal.u8 q9, d5, d28
2236 vmlal.u8 q10, d0, d28
2237 vmlal.u8 q11, d1, d28
2238 vmov q1, q0 /* backup source pixels to q1 */
2239 vrshrn.u16 d6, q8, #2
2240 vrshrn.u16 d7, q9, #2
2241 vshrn.u16 d8, q10, #2
2242 vshrn.u16 d9, q11, #2
DRC316617f2012-06-13 05:17:03 +00002243 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
2244.endm
2245
2246/*
2247 * Upsample 32 source pixels to 64 destination pixels. Compared to the
2248 * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
2249 * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
2250 * instruction is not needed. This unrolling also allows loads and stores to
2251 * be reordered to hide multiplication latency and reduce stalls.
2252 */
DRCcf888482016-02-02 23:17:06 -06002253.macro upsample32 OUTPTR, INPTR
DRC316617f2012-06-13 05:17:03 +00002254 /* even 16 pixels group */
2255 vld1.8 {q0}, [\INPTR]!
DRCcf888482016-02-02 23:17:06 -06002256 vmovl.u8 q8, d0
2257 vext.8 q2, q1, q0, #15
2258 vmovl.u8 q9, d1
DRC316617f2012-06-13 05:17:03 +00002259 vaddw.u8 q10, q15, d4
2260 vaddw.u8 q11, q15, d5
DRCcf888482016-02-02 23:17:06 -06002261 vmlal.u8 q8, d4, d28
2262 vmlal.u8 q9, d5, d28
2263 vmlal.u8 q10, d0, d28
2264 vmlal.u8 q11, d1, d28
2265 /* odd 16 pixels group */
2266 vld1.8 {q1}, [\INPTR]!
2267 vrshrn.u16 d6, q8, #2
2268 vrshrn.u16 d7, q9, #2
2269 vshrn.u16 d8, q10, #2
2270 vshrn.u16 d9, q11, #2
2271 vmovl.u8 q8, d2
2272 vext.8 q2, q0, q1, #15
2273 vmovl.u8 q9, d3
2274 vaddw.u8 q10, q15, d4
2275 vaddw.u8 q11, q15, d5
2276 vmlal.u8 q8, d4, d28
2277 vmlal.u8 q9, d5, d28
2278 vmlal.u8 q10, d2, d28
2279 vmlal.u8 q11, d3, d28
DRC316617f2012-06-13 05:17:03 +00002280 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
DRCcf888482016-02-02 23:17:06 -06002281 vrshrn.u16 d6, q8, #2
2282 vrshrn.u16 d7, q9, #2
2283 vshrn.u16 d8, q10, #2
2284 vshrn.u16 d9, q11, #2
2285 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
DRC316617f2012-06-13 05:17:03 +00002286.endm
2287
2288/*
2289 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2290 */
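/*
 * Edge handling matches the scalar code: the first and last output pixels
 * are plain copies (out[0] = in[0], out[2*WIDTH-1] = in[WIDTH-1]), and
 * d3[7] is pre-seeded with in[0] so that the first vector group sees a
 * valid "previous" pixel.
 */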
2291.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2292 /* special case for the first and last pixels */
2293 sub \WIDTH, \WIDTH, #1
2294 add \OUTPTR, \OUTPTR, #1
2295 ldrb \TMP1, [\INPTR, \WIDTH]
2296 strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
2297 ldrb \TMP1, [\INPTR], #1
2298 strb \TMP1, [\OUTPTR, #-1]
2299 vmov.8 d3[7], \TMP1
2300
2301 subs \WIDTH, \WIDTH, #32
2302 blt 5f
23030: /* process 32 pixels per iteration */
2304 upsample32 \OUTPTR, \INPTR
2305 subs \WIDTH, \WIDTH, #32
2306 bge 0b
23075:
2308 adds \WIDTH, \WIDTH, #16
2309 blt 1f
23100: /* process 16 pixels if needed */
2311 upsample16 \OUTPTR, \INPTR
2312 subs \WIDTH, \WIDTH, #16
23131:
2314 adds \WIDTH, \WIDTH, #16
2315 beq 9f
2316
2317 /* load the remaining 1-15 pixels */
2318 add \INPTR, \INPTR, \WIDTH
2319 tst \WIDTH, #1
2320 beq 2f
2321 sub \INPTR, \INPTR, #1
2322 vld1.8 {d0[0]}, [\INPTR]
23232:
2324 tst \WIDTH, #2
2325 beq 2f
2326 vext.8 d0, d0, d0, #6
2327 sub \INPTR, \INPTR, #1
2328 vld1.8 {d0[1]}, [\INPTR]
2329 sub \INPTR, \INPTR, #1
2330 vld1.8 {d0[0]}, [\INPTR]
23312:
2332 tst \WIDTH, #4
2333 beq 2f
2334 vrev64.32 d0, d0
2335 sub \INPTR, \INPTR, #1
2336 vld1.8 {d0[3]}, [\INPTR]
2337 sub \INPTR, \INPTR, #1
2338 vld1.8 {d0[2]}, [\INPTR]
2339 sub \INPTR, \INPTR, #1
2340 vld1.8 {d0[1]}, [\INPTR]
2341 sub \INPTR, \INPTR, #1
2342 vld1.8 {d0[0]}, [\INPTR]
23432:
2344 tst \WIDTH, #8
2345 beq 2f
DRCcf888482016-02-02 23:17:06 -06002346 vmov d1, d0
DRC316617f2012-06-13 05:17:03 +00002347 sub \INPTR, \INPTR, #8
2348 vld1.8 {d0}, [\INPTR]
23492: /* upsample the remaining pixels */
DRCcf888482016-02-02 23:17:06 -06002350 vmovl.u8 q8, d0
2351 vext.8 q2, q1, q0, #15
2352 vmovl.u8 q9, d1
DRC316617f2012-06-13 05:17:03 +00002353 vaddw.u8 q10, q15, d4
2354 vaddw.u8 q11, q15, d5
DRCcf888482016-02-02 23:17:06 -06002355 vmlal.u8 q8, d4, d28
2356 vmlal.u8 q9, d5, d28
2357 vmlal.u8 q10, d0, d28
2358 vmlal.u8 q11, d1, d28
2359 vrshrn.u16 d10, q8, #2
2360 vrshrn.u16 d12, q9, #2
DRC316617f2012-06-13 05:17:03 +00002361 vshrn.u16 d11, q10, #2
2362 vshrn.u16 d13, q11, #2
2363 vzip.8 d10, d11
2364 vzip.8 d12, d13
2365 /* store the remaining pixels */
2366 tst \WIDTH, #8
2367 beq 2f
2368 vst1.8 {d10, d11}, [\OUTPTR]!
DRCcf888482016-02-02 23:17:06 -06002369 vmov q5, q6
DRC316617f2012-06-13 05:17:03 +000023702:
2371 tst \WIDTH, #4
2372 beq 2f
2373 vst1.8 {d10}, [\OUTPTR]!
DRCcf888482016-02-02 23:17:06 -06002374 vmov d10, d11
DRC316617f2012-06-13 05:17:03 +000023752:
2376 tst \WIDTH, #2
2377 beq 2f
2378 vst1.8 {d10[0]}, [\OUTPTR]!
2379 vst1.8 {d10[1]}, [\OUTPTR]!
2380 vst1.8 {d10[2]}, [\OUTPTR]!
2381 vst1.8 {d10[3]}, [\OUTPTR]!
2382 vext.8 d10, d10, d10, #4
23832:
2384 tst \WIDTH, #1
2385 beq 2f
2386 vst1.8 {d10[0]}, [\OUTPTR]!
2387 vst1.8 {d10[1]}, [\OUTPTR]!
23882:
23899:
2390.endm
2391
2392asm_function jsimd_h2v1_fancy_upsample_neon
2393
2394 MAX_V_SAMP_FACTOR .req r0
2395 DOWNSAMPLED_WIDTH .req r1
2396 INPUT_DATA .req r2
2397 OUTPUT_DATA_PTR .req r3
2398 OUTPUT_DATA .req OUTPUT_DATA_PTR
2399
2400 OUTPTR .req r4
2401 INPTR .req r5
2402 WIDTH .req ip
2403 TMP .req lr
2404
2405 push {r4, r5, r6, lr}
2406 vpush {d8-d15}
2407
2408 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
2409 cmp MAX_V_SAMP_FACTOR, #0
2410 ble 99f
2411
2412 /* initialize constants */
2413 vmov.u8 d28, #3
2414 vmov.u16 q15, #1
241511:
2416 ldr INPTR, [INPUT_DATA], #4
2417 ldr OUTPTR, [OUTPUT_DATA], #4
2418 mov WIDTH, DOWNSAMPLED_WIDTH
2419 upsample_row OUTPTR, INPTR, WIDTH, TMP
2420 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2421 bgt 11b
2422
242399:
2424 vpop {d8-d15}
2425 pop {r4, r5, r6, pc}
2426
2427 .unreq MAX_V_SAMP_FACTOR
2428 .unreq DOWNSAMPLED_WIDTH
2429 .unreq INPUT_DATA
2430 .unreq OUTPUT_DATA_PTR
2431 .unreq OUTPUT_DATA
2432
2433 .unreq OUTPTR
2434 .unreq INPTR
2435 .unreq WIDTH
2436 .unreq TMP
2437
DRC316617f2012-06-13 05:17:03 +00002438.purgem upsample16
2439.purgem upsample32
2440.purgem upsample_row
DRC499c4702016-01-13 03:13:20 -06002441
DRCcf888482016-02-02 23:17:06 -06002442
DRC499c4702016-01-13 03:13:20 -06002443/*****************************************************************************/
2444
2445/*
2446 * GLOBAL(JOCTET*)
DRCbd498032016-02-19 08:53:33 -06002447 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
DRC15aaa7f2016-02-07 17:39:33 -06002448 * JCOEFPTR block, int last_dc_val,
2449 * c_derived_tbl *dctbl, c_derived_tbl *actbl)
DRC499c4702016-01-13 03:13:20 -06002450 *
2451 */
2452
2453.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
DRCcf888482016-02-02 23:17:06 -06002454 sub \PUT_BITS, \PUT_BITS, #0x8
2455 lsr \TMP, \PUT_BUFFER, \PUT_BITS
2456 uxtb \TMP, \TMP
2457 strb \TMP, [\BUFFER, #1]!
2458 cmp \TMP, #0xff
DRC499c4702016-01-13 03:13:20 -06002459 /*it eq*/
DRCcf888482016-02-02 23:17:06 -06002460 streqb \ZERO, [\BUFFER, #1]!
DRC499c4702016-01-13 03:13:20 -06002461.endm
DRCcf888482016-02-02 23:17:06 -06002462
DRC499c4702016-01-13 03:13:20 -06002463.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
DRCcf888482016-02-02 23:17:06 -06002464 /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
2465 add \PUT_BITS, \SIZE
2466 /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
2467 orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
DRC499c4702016-01-13 03:13:20 -06002468.endm
DRCcf888482016-02-02 23:17:06 -06002469
DRC499c4702016-01-13 03:13:20 -06002470.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
DRCcf888482016-02-02 23:17:06 -06002471 cmp \PUT_BITS, #0x10
2472 blt 15f
2473 eor \ZERO, \ZERO, \ZERO
2474 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
2475 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
DRC499c4702016-01-13 03:13:20 -0600247615:
2477.endm
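/*
 * In C terms the three macros above amount to roughly the following
 * (a sketch; put_buffer/put_bits mirror the pair saved back into the
 * working state at the end of the function):
 *
 *   put_bits:    put_buffer = (put_buffer << size) | code;  put_bits += size;
 *   emit_byte:   put_bits -= 8;  *(++buffer) = (JOCTET) (put_buffer >> put_bits);
 *                if (*buffer == 0xFF) *(++buffer) = 0;  /- JPEG byte stuffing -/
 *   checkbuf15:  if (put_bits >= 16) emit_byte twice
 */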
2478
2479.balign 16
2480jsimd_huff_encode_one_block_neon_consts:
DRCcf888482016-02-02 23:17:06 -06002481 .byte 0x01
2482 .byte 0x02
2483 .byte 0x04
2484 .byte 0x08
2485 .byte 0x10
2486 .byte 0x20
2487 .byte 0x40
2488 .byte 0x80
DRC499c4702016-01-13 03:13:20 -06002489
2490asm_function jsimd_huff_encode_one_block_neon
DRCcf888482016-02-02 23:17:06 -06002491 push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2492 add r7, sp, #0x1c
2493 sub r4, sp, #0x40
2494 bfc r4, #0, #5
2495 mov sp, r4 /* align sp on 32 bytes */
2496 vst1.64 {d8, d9, d10, d11}, [r4, :128]!
2497 vst1.64 {d12, d13, d14, d15}, [r4, :128]
2498 sub sp, #0x140 /* reserve 320 bytes */
2499 str r0, [sp, #0x18] /* working state -> sp + 0x18 */
2500 add r4, sp, #0x20 /* r4 = t1 */
2501 ldr lr, [r7, #0x8] /* lr = dctbl */
2502 sub r10, r1, #0x1 /* r10 = buffer - 1 (emit_byte stores are pre-indexed) */
2503 ldrsh r1, [r2]
2504 mov r9, #0x10
2505 mov r8, #0x1
2506 adr r5, jsimd_huff_encode_one_block_neon_consts
DRC499c4702016-01-13 03:13:20 -06002507 /* prepare data */
DRCcf888482016-02-02 23:17:06 -06002508 vld1.8 {d26}, [r5, :64]
2509 veor q8, q8, q8
2510 veor q9, q9, q9
2511 vdup.16 q14, r9
2512 vdup.16 q15, r8
2513 veor q10, q10, q10
2514 veor q11, q11, q11
2515 sub r1, r1, r3
2516 add r9, r2, #0x22
2517 add r8, r2, #0x18
2518 add r3, r2, #0x36
2519 vmov.16 d0[0], r1
2520 vld1.16 {d2[0]}, [r9, :16]
2521 vld1.16 {d4[0]}, [r8, :16]
2522 vld1.16 {d6[0]}, [r3, :16]
2523 add r1, r2, #0x2
2524 add r9, r2, #0x30
2525 add r8, r2, #0x26
2526 add r3, r2, #0x28
2527 vld1.16 {d0[1]}, [r1, :16]
2528 vld1.16 {d2[1]}, [r9, :16]
2529 vld1.16 {d4[1]}, [r8, :16]
2530 vld1.16 {d6[1]}, [r3, :16]
2531 add r1, r2, #0x10
2532 add r9, r2, #0x40
2533 add r8, r2, #0x34
2534 add r3, r2, #0x1a
2535 vld1.16 {d0[2]}, [r1, :16]
2536 vld1.16 {d2[2]}, [r9, :16]
2537 vld1.16 {d4[2]}, [r8, :16]
2538 vld1.16 {d6[2]}, [r3, :16]
2539 add r1, r2, #0x20
2540 add r9, r2, #0x32
2541 add r8, r2, #0x42
2542 add r3, r2, #0xc
2543 vld1.16 {d0[3]}, [r1, :16]
2544 vld1.16 {d2[3]}, [r9, :16]
2545 vld1.16 {d4[3]}, [r8, :16]
2546 vld1.16 {d6[3]}, [r3, :16]
2547 add r1, r2, #0x12
2548 add r9, r2, #0x24
2549 add r8, r2, #0x50
2550 add r3, r2, #0xe
2551 vld1.16 {d1[0]}, [r1, :16]
2552 vld1.16 {d3[0]}, [r9, :16]
2553 vld1.16 {d5[0]}, [r8, :16]
2554 vld1.16 {d7[0]}, [r3, :16]
2555 add r1, r2, #0x4
2556 add r9, r2, #0x16
2557 add r8, r2, #0x60
2558 add r3, r2, #0x1c
2559 vld1.16 {d1[1]}, [r1, :16]
2560 vld1.16 {d3[1]}, [r9, :16]
2561 vld1.16 {d5[1]}, [r8, :16]
2562 vld1.16 {d7[1]}, [r3, :16]
2563 add r1, r2, #0x6
2564 add r9, r2, #0x8
2565 add r8, r2, #0x52
2566 add r3, r2, #0x2a
2567 vld1.16 {d1[2]}, [r1, :16]
2568 vld1.16 {d3[2]}, [r9, :16]
2569 vld1.16 {d5[2]}, [r8, :16]
2570 vld1.16 {d7[2]}, [r3, :16]
2571 add r1, r2, #0x14
2572 add r9, r2, #0xa
2573 add r8, r2, #0x44
2574 add r3, r2, #0x38
2575 vld1.16 {d1[3]}, [r1, :16]
2576 vld1.16 {d3[3]}, [r9, :16]
2577 vld1.16 {d5[3]}, [r8, :16]
2578 vld1.16 {d7[3]}, [r3, :16]
2579 vcgt.s16 q8, q8, q0
2580 vcgt.s16 q9, q9, q1
2581 vcgt.s16 q10, q10, q2
2582 vcgt.s16 q11, q11, q3
2583 vabs.s16 q0, q0
2584 vabs.s16 q1, q1
2585 vabs.s16 q2, q2
2586 vabs.s16 q3, q3
2587 veor q8, q8, q0
2588 veor q9, q9, q1
2589 veor q10, q10, q2
2590 veor q11, q11, q3
2591 add r9, r4, #0x20
2592 add r8, r4, #0x80
2593 add r3, r4, #0xa0
2594 vclz.i16 q0, q0
2595 vclz.i16 q1, q1
2596 vclz.i16 q2, q2
2597 vclz.i16 q3, q3
2598 vsub.i16 q0, q14, q0
2599 vsub.i16 q1, q14, q1
2600 vsub.i16 q2, q14, q2
2601 vsub.i16 q3, q14, q3
2602 vst1.16 {d0, d1, d2, d3}, [r4, :256]
2603 vst1.16 {d4, d5, d6, d7}, [r9, :256]
2604 vshl.s16 q0, q15, q0
2605 vshl.s16 q1, q15, q1
2606 vshl.s16 q2, q15, q2
2607 vshl.s16 q3, q15, q3
2608 vsub.i16 q0, q0, q15
2609 vsub.i16 q1, q1, q15
2610 vsub.i16 q2, q2, q15
2611 vsub.i16 q3, q3, q15
2612 vand q8, q8, q0
2613 vand q9, q9, q1
2614 vand q10, q10, q2
2615 vand q11, q11, q3
2616 vst1.16 {d16, d17, d18, d19}, [r8, :256]
2617 vst1.16 {d20, d21, d22, d23}, [r3, :256]
2618 add r1, r2, #0x46
2619 add r9, r2, #0x3a
2620 add r8, r2, #0x74
2621 add r3, r2, #0x6a
2622 vld1.16 {d8[0]}, [r1, :16]
2623 vld1.16 {d10[0]}, [r9, :16]
2624 vld1.16 {d12[0]}, [r8, :16]
2625 vld1.16 {d14[0]}, [r3, :16]
2626 veor q8, q8, q8
2627 veor q9, q9, q9
2628 veor q10, q10, q10
2629 veor q11, q11, q11
2630 add r1, r2, #0x54
2631 add r9, r2, #0x2c
2632 add r8, r2, #0x76
2633 add r3, r2, #0x78
2634 vld1.16 {d8[1]}, [r1, :16]
2635 vld1.16 {d10[1]}, [r9, :16]
2636 vld1.16 {d12[1]}, [r8, :16]
2637 vld1.16 {d14[1]}, [r3, :16]
2638 add r1, r2, #0x62
2639 add r9, r2, #0x1e
2640 add r8, r2, #0x68
2641 add r3, r2, #0x7a
2642 vld1.16 {d8[2]}, [r1, :16]
2643 vld1.16 {d10[2]}, [r9, :16]
2644 vld1.16 {d12[2]}, [r8, :16]
2645 vld1.16 {d14[2]}, [r3, :16]
2646 add r1, r2, #0x70
2647 add r9, r2, #0x2e
2648 add r8, r2, #0x5a
2649 add r3, r2, #0x6c
2650 vld1.16 {d8[3]}, [r1, :16]
2651 vld1.16 {d10[3]}, [r9, :16]
2652 vld1.16 {d12[3]}, [r8, :16]
2653 vld1.16 {d14[3]}, [r3, :16]
2654 add r1, r2, #0x72
2655 add r9, r2, #0x3c
2656 add r8, r2, #0x4c
2657 add r3, r2, #0x5e
2658 vld1.16 {d9[0]}, [r1, :16]
2659 vld1.16 {d11[0]}, [r9, :16]
2660 vld1.16 {d13[0]}, [r8, :16]
2661 vld1.16 {d15[0]}, [r3, :16]
2662 add r1, r2, #0x64
2663 add r9, r2, #0x4a
2664 add r8, r2, #0x3e
2665 add r3, r2, #0x6e
2666 vld1.16 {d9[1]}, [r1, :16]
2667 vld1.16 {d11[1]}, [r9, :16]
2668 vld1.16 {d13[1]}, [r8, :16]
2669 vld1.16 {d15[1]}, [r3, :16]
2670 add r1, r2, #0x56
2671 add r9, r2, #0x58
2672 add r8, r2, #0x4e
2673 add r3, r2, #0x7c
2674 vld1.16 {d9[2]}, [r1, :16]
2675 vld1.16 {d11[2]}, [r9, :16]
2676 vld1.16 {d13[2]}, [r8, :16]
2677 vld1.16 {d15[2]}, [r3, :16]
2678 add r1, r2, #0x48
2679 add r9, r2, #0x66
2680 add r8, r2, #0x5c
2681 add r3, r2, #0x7e
2682 vld1.16 {d9[3]}, [r1, :16]
2683 vld1.16 {d11[3]}, [r9, :16]
2684 vld1.16 {d13[3]}, [r8, :16]
2685 vld1.16 {d15[3]}, [r3, :16]
2686 vcgt.s16 q8, q8, q4
2687 vcgt.s16 q9, q9, q5
2688 vcgt.s16 q10, q10, q6
2689 vcgt.s16 q11, q11, q7
2690 vabs.s16 q4, q4
2691 vabs.s16 q5, q5
2692 vabs.s16 q6, q6
2693 vabs.s16 q7, q7
2694 veor q8, q8, q4
2695 veor q9, q9, q5
2696 veor q10, q10, q6
2697 veor q11, q11, q7
2698 add r1, r4, #0x40
2699 add r9, r4, #0x60
2700 add r8, r4, #0xc0
2701 add r3, r4, #0xe0
2702 vclz.i16 q4, q4
2703 vclz.i16 q5, q5
2704 vclz.i16 q6, q6
2705 vclz.i16 q7, q7
2706 vsub.i16 q4, q14, q4
2707 vsub.i16 q5, q14, q5
2708 vsub.i16 q6, q14, q6
2709 vsub.i16 q7, q14, q7
2710 vst1.16 {d8, d9, d10, d11}, [r1, :256]
2711 vst1.16 {d12, d13, d14, d15}, [r9, :256]
2712 vshl.s16 q4, q15, q4
2713 vshl.s16 q5, q15, q5
2714 vshl.s16 q6, q15, q6
2715 vshl.s16 q7, q15, q7
2716 vsub.i16 q4, q4, q15
2717 vsub.i16 q5, q5, q15
2718 vsub.i16 q6, q6, q15
2719 vsub.i16 q7, q7, q15
2720 vand q8, q8, q4
2721 vand q9, q9, q5
2722 vand q10, q10, q6
2723 vand q11, q11, q7
2724 vst1.16 {d16, d17, d18, d19}, [r8, :256]
2725 vst1.16 {d20, d21, d22, d23}, [r3, :256]
2726 ldr r12, [r7, #0xc] /* r12 = actbl */
2727 add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
2728 mov r9, r12 /* r9 = actbl */
2729 add r6, r4, #0x80 /* r6 = t2 */
2730 ldr r11, [r0, #0x8] /* r11 = put_buffer */
2731 ldr r4, [r0, #0xc] /* r4 = put_bits */
2732 ldrh r2, [r6, #-128] /* r2 = nbits */
2733 ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */
2734 ldr r0, [lr, r2, lsl #2]
2735 ldrb r5, [r1, r2]
2736 put_bits r11, r4, r0, r5
2737 checkbuf15 r10, r11, r4, r5, r0
2738 put_bits r11, r4, r3, r2
2739 checkbuf15 r10, r11, r4, r5, r0
2740 mov lr, r6 /* lr = t2 */
2741 add r5, r9, #0x400 /* r5 = actbl->ehufsi */
2742 ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
2743 veor q8, q8, q8
2744 vceq.i16 q0, q0, q8
2745 vceq.i16 q1, q1, q8
2746 vceq.i16 q2, q2, q8
2747 vceq.i16 q3, q3, q8
2748 vceq.i16 q4, q4, q8
2749 vceq.i16 q5, q5, q8
2750 vceq.i16 q6, q6, q8
2751 vceq.i16 q7, q7, q8
2752 vmovn.i16 d0, q0
2753 vmovn.i16 d2, q1
2754 vmovn.i16 d4, q2
2755 vmovn.i16 d6, q3
2756 vmovn.i16 d8, q4
2757 vmovn.i16 d10, q5
2758 vmovn.i16 d12, q6
2759 vmovn.i16 d14, q7
2760 vand d0, d0, d26
2761 vand d2, d2, d26
2762 vand d4, d4, d26
2763 vand d6, d6, d26
2764 vand d8, d8, d26
2765 vand d10, d10, d26
2766 vand d12, d12, d26
2767 vand d14, d14, d26
2768 vpadd.i8 d0, d0, d2
2769 vpadd.i8 d4, d4, d6
2770 vpadd.i8 d8, d8, d10
2771 vpadd.i8 d12, d12, d14
2772 vpadd.i8 d0, d0, d4
2773 vpadd.i8 d8, d8, d12
2774 vpadd.i8 d0, d0, d8
2775 vmov.32 r1, d0[1]
2776 vmov.32 r8, d0[0]
2777 mvn r1, r1
2778 mvn r8, r8
2779 lsrs r1, r1, #0x1
2780 rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
2781 rbit r1, r1 /* r1 = index1 */
2782 rbit r8, r8 /* r8 = index0 */
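/*
 * r8/r1 now hold bitmaps of the nonzero AC coefficients (bit set means
 * t1[i] != 0), built from the vceq results ANDed with the power-of-two
 * mask in d26 and folded with vpadd; the clz/lsl steps below then jump
 * directly from one nonzero coefficient to the next, counting zero runs.
 */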
2783 ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
2784 str r1, [sp, #0x14] /* index1 -> sp + 0x14 */
2785 cmp r8, #0x0
2786 beq 6f
DRC499c4702016-01-13 03:13:20 -060027871:
DRCcf888482016-02-02 23:17:06 -06002788 clz r2, r8
2789 add lr, lr, r2, lsl #1
2790 lsl r8, r8, r2
2791 ldrh r1, [lr, #-126]
DRC499c4702016-01-13 03:13:20 -060027922:
DRCcf888482016-02-02 23:17:06 -06002793 cmp r2, #0x10
2794 blt 3f
2795 sub r2, r2, #0x10
2796 put_bits r11, r4, r0, r6
2797 cmp r4, #0x10
2798 blt 2b
2799 eor r3, r3, r3
2800 emit_byte r10, r11, r4, r3, r12
2801 emit_byte r10, r11, r4, r3, r12
2802 b 2b
DRC499c4702016-01-13 03:13:20 -060028033:
DRCcf888482016-02-02 23:17:06 -06002804 add r2, r1, r2, lsl #4
2805 ldrh r3, [lr, #2]!
2806 ldr r12, [r9, r2, lsl #2]
2807 ldrb r2, [r5, r2]
2808 put_bits r11, r4, r12, r2
2809 checkbuf15 r10, r11, r4, r2, r12
2810 put_bits r11, r4, r3, r1
2811 checkbuf15 r10, r11, r4, r2, r12
2812 lsls r8, r8, #0x1
2813 bne 1b
DRC499c4702016-01-13 03:13:20 -060028146:
DRCcf888482016-02-02 23:17:06 -06002815 add r12, sp, #0x20 /* r12 = t1 */
2816 ldr r8, [sp, #0x14] /* r8 = index1 */
2817 adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
2818 cmp r8, #0x0
2819 beq 6f
2820 clz r2, r8
2821 sub r12, r12, lr
2822 lsl r8, r8, r2
2823 add r2, r2, r12, lsr #1
2824 add lr, lr, r2, lsl #1
2825 b 7f
DRC499c4702016-01-13 03:13:20 -060028261:
DRCcf888482016-02-02 23:17:06 -06002827 clz r2, r8
2828 add lr, lr, r2, lsl #1
2829 lsl r8, r8, r2
DRC499c4702016-01-13 03:13:20 -060028307:
DRCcf888482016-02-02 23:17:06 -06002831 ldrh r1, [lr, #-126]
DRC499c4702016-01-13 03:13:20 -060028322:
DRCcf888482016-02-02 23:17:06 -06002833 cmp r2, #0x10
2834 blt 3f
2835 sub r2, r2, #0x10
2836 put_bits r11, r4, r0, r6
2837 cmp r4, #0x10
2838 blt 2b
2839 eor r3, r3, r3
2840 emit_byte r10, r11, r4, r3, r12
2841 emit_byte r10, r11, r4, r3, r12
2842 b 2b
DRC499c4702016-01-13 03:13:20 -060028433:
DRCcf888482016-02-02 23:17:06 -06002844 add r2, r1, r2, lsl #4
2845 ldrh r3, [lr, #2]!
2846 ldr r12, [r9, r2, lsl #2]
2847 ldrb r2, [r5, r2]
2848 put_bits r11, r4, r12, r2
2849 checkbuf15 r10, r11, r4, r2, r12
2850 put_bits r11, r4, r3, r1
2851 checkbuf15 r10, r11, r4, r2, r12
2852 lsls r8, r8, #0x1
2853 bne 1b
DRC499c4702016-01-13 03:13:20 -060028546:
DRCcf888482016-02-02 23:17:06 -06002855 add r0, sp, #0x20
2856 add r0, #0xfe
2857 cmp lr, r0
2858 bhs 1f
2859 ldr r1, [r9]
2860 ldrb r0, [r5]
2861 put_bits r11, r4, r1, r0
2862 checkbuf15 r10, r11, r4, r0, r1
DRC499c4702016-01-13 03:13:20 -060028631:
DRCcf888482016-02-02 23:17:06 -06002864 ldr r12, [sp, #0x18]
2865 str r11, [r12, #0x8]
2866 str r4, [r12, #0xc]
2867 add r0, r10, #0x1
2868 add r4, sp, #0x140
2869 vld1.64 {d8, d9, d10, d11}, [r4, :128]!
2870 vld1.64 {d12, d13, d14, d15}, [r4, :128]
2871 sub r4, r7, #0x1c
2872 mov sp, r4
2873 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
DRC499c4702016-01-13 03:13:20 -06002874
2875.purgem emit_byte
2876.purgem put_bits
2877.purgem checkbuf15