blob: 568768f7b8a8415e11234a8196e8a66e6ad92fdd [file] [log] [blame]
hbono@chromium.org98626972011-08-03 03:13:08 +00001/*
noel@chromium.org3395bcc2014-04-14 06:56:00 +00002 * ARMv7 NEON optimizations for libjpeg-turbo
hbono@chromium.org98626972011-08-03 03:13:08 +00003 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00006 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Tom Hudson0d47d2d2016-05-04 13:22:56 -04007 * Copyright (C) 2014 Siarhei Siamashka. All Rights Reserved.
Aaron Gablefeec46f2015-08-06 09:54:48 -07008 * Copyright (C) 2014 Linaro Limited. All Rights Reserved.
Tom Hudson0d47d2d2016-05-04 13:22:56 -04009 * Copyright (C) 2015 D. R. Commander. All Rights Reserved.
10 * Copyright (C) 2015-2016 Matthieu Darbois. All Rights Reserved.
hbono@chromium.org98626972011-08-03 03:13:08 +000011 *
12 * This software is provided 'as-is', without any express or implied
13 * warranty. In no event will the authors be held liable for any damages
14 * arising from the use of this software.
15 *
16 * Permission is granted to anyone to use this software for any purpose,
17 * including commercial applications, and to alter it and redistribute it
18 * freely, subject to the following restrictions:
19 *
20 * 1. The origin of this software must not be misrepresented; you must not
21 * claim that you wrote the original software. If you use this software
22 * in a product, an acknowledgment in the product documentation would be
23 * appreciated but is not required.
24 * 2. Altered source versions must be plainly marked as such, and must not be
25 * misrepresented as being the original software.
26 * 3. This notice may not be removed or altered from any source distribution.
27 */
28
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
#endif

.text
.fpu neon               /* permit NEON (Advanced SIMD) instructions */
.arch armv7a            /* assemble for ARMv7-A */
.object_arch armv4      /* tag object attributes as ARMv4 — presumably so the
                           binary is not forced to ARMv7 and NEON availability
                           can be decided at run time; TODO confirm against the
                           dispatch code elsewhere in libjpeg-turbo */
.arm                    /* ARM (not Thumb) encoding */
.syntax unified         /* UAL syntax */


#define RESPECT_STRICT_ALIGNMENT 1  /* NOTE(review): not referenced in this
                                       chunk — presumably consumed by code
                                       outside this view */
noel@chromium.org3395bcc2014-04-14 06:56:00 +000043
hbono@chromium.org98626972011-08-03 03:13:08 +000044/*****************************************************************************/
45
/* Supplementary macro for setting function attributes.
 * Emits a global entry label for \fname with the correct platform decoration:
 *  - Mach-O (__APPLE__): symbols carry a leading underscore.
 *  - ELF: additionally mark the symbol hidden (not exported from a shared
 *    library) and give it %function type so tools/unwinders treat it as code.
 */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
60
/* Transpose a block of 4x4 16-bit coefficients held in four 64-bit registers
 * (\x0..\x3 are the four rows).  Two rounds of VTRN do the transpose:
 * vtrn.16 exchanges odd/even 16-bit lanes between register pairs, then
 * vtrn.32 exchanges the 32-bit halves, completing the 4x4 transpose in place.
 * NOTE(review): defined but not expanded anywhere in this chunk.
 */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
68
noel@chromium.org3395bcc2014-04-14 06:56:00 +000069
hbono@chromium.orgc6beb742011-11-29 05:16:26 +000070#define CENTERJSAMPLE 128
71
72/*****************************************************************************/
73
74/*
75 * Perform dequantization and inverse DCT on one block of coefficients.
76 *
77 * GLOBAL(void)
Tom Hudson0d47d2d2016-05-04 13:22:56 -040078 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
hbono@chromium.orgc6beb742011-11-29 05:16:26 +000079 * JSAMPARRAY output_buf, JDIMENSION output_col)
80 */
81
/* ISLOW iDCT multiplier constants in 13-bit fixed point:
 * FIX_x_yyy = round(x.yyy * 2^13), e.g. 0.541196100 * 8192 ≈ 4433.
 * These match the CONST_BITS=13 constants of the reference jidctint.c. */
#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

/* Pre-combined differences/sums so each product needs a single multiplier
 * that still fits in 16 bits (used as vmull/vmlal scalar operands below). */
#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
103
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 *
 * NOTE(review): this macro is documentation for the NEON code below — nothing
 * in this file expands it.  DCTELEM/JLONG/MULTIPLY and the tmp* outputs come
 * from the libjpeg-turbo headers / enclosing context.  The q1..q7 names
 * deliberately mirror the NEON q-register allocation used in the assembly.
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  JLONG q1, q2, q3, q4, q5, q6, q7; \
  JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  row0 = xrow0; \
  row1 = xrow1; \
  row2 = xrow2; \
  row3 = xrow3; \
  row4 = xrow4; \
  row5 = xrow5; \
  row6 = xrow6; \
  row7 = xrow7; \
  \
  q5 = row7 + row3; \
  q4 = row5 + row1; \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q4 = q6; \
  q3 = ((JLONG) row0 - (JLONG) row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q1 = q3 + q2; \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q5 = q7; \
  q1 = q1 + q6; \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  row1 = 0; \
  \
  q1 = q1 - q6; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q1 = q1 - q6; /* subtracted twice: q1 = tmp11_plus_tmp2 - 2*q6 */ \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  q3 = q3 - q2; \
  \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  \
  q1 = ((JLONG) row0 + (JLONG) row4) << 13; \
  q2 = q1 + q6; \
  q1 = q1 - q6; \
  \
  /* pick up the results */ \
  tmp0 = q4; \
  tmp1 = q5; \
  tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp3 = q7; \
  tmp10 = q2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  tmp12 = q3; \
  tmp13 = q1; \
}
174
/* The twelve multiplier constants live in d0-d2 at run time; each XFIX_*
 * name maps a constant to its 16-bit lane so it can be used directly as the
 * scalar operand of vmull.s16/vmlal.s16/vmlsl.s16.  The .short table below
 * MUST stay in exactly this order — it is bulk-loaded into d0-d2. */
#define XFIX_0_899976223                   d0[0]
#define XFIX_0_541196100                   d0[1]
#define XFIX_2_562915447                   d0[2]
#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
#define XFIX_1_175875602                   d1[3]
#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
#define XFIX_1_175875602_MINUS_1_961570560 d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants (d2) — clobbered during the passes and re-read
     * from [ip] before each half that needs them */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000203
asm_function jsimd_idct_islow_neon
/*
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * AAPCS: the four arguments arrive in r0-r3 (aliased below); ip is scratch.
 * Callee-saved d8-d15 are preserved with vpush/vpop; r4/r5 with push/pop.
 * Note TMP1-TMP3 alias the argument registers — the arguments are dead by
 * the time TMPs are used in the store epilogue.
 */

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Row n of the 8x8 block: ROWnL = columns 0-3, ROWnR = columns 4-7 */
    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | d16    | d17    ( q8  )
     *   1   | d18    | d19    ( q9  )
     *   2   | d20    | d21    ( q10 )
     *   3   | d22    | d23    ( q11 )
     *   4   | d24    | d25    ( q12 )
     *   5   | d26    | d27    ( q13 )
     *   6   | d28    | d29    ( q14 )
     *   7   | d30    | d31    ( q15 )
     * Dequantization is a plain elementwise vmul with the dct_table rows;
     * loads and multiplies are interleaved to hide load latency. */
    adr ip, jsimd_idct_islow_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0, d1, d2, d3}, [ip, :128]   /* load constants */
    add ip, ip, #16                        /* ip -> reloadable d2 constants */
    vmul.s16 q15, q15, q3
    vpush {d8-d15}                         /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16 d4, ROW7L, ROW3L
    vadd.s16 d5, ROW5L, ROW1L
    vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d5, XFIX_1_175875602
    vmull.s16 q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half.
     * Interleaved with the NEON arithmetic: each ldrd pulls the 8 bytes of
     * one row's columns 4-7 (COEF_BLOCK has advanced 96 bytes, hence -96),
     * and the ORs accumulate an "any nonzero?" mask for rows 1-7 in r0.
     * NOTE(review): UAL prefers the explicit form `ldrd r4, r5, [...]`;
     * GAS infers r5 from r4 here. */
    push {r4, r5}
    vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW4L
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr r0, r4, r5
    vmov q4, q6
    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    orr r0, r0, r4
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    orr r0, r0, r5
    vadd.s32 q1, q3, q2
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov q5, q7
    vadd.s32 q1, q1, q6
    orr r0, r0, r4
    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
    orr r0, r0, r5
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1L, q1, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr r0, r0, r4
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    orr r0, r0, r5
    vsub.s32 q1, q1, q6                    /* q6 subtracted twice, as in REF_1D_IDCT */
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16 q6, ROW6L, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    orr r0, r0, r4
    vrshrn.s32 ROW6L, q1, #11
    orr r0, r0, r5
    vadd.s32 q1, q3, q5
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW4L
    orr r0, r0, r4
    vrshrn.s32 ROW2L, q1, #11
    orr r0, r0, r5
    vrshrn.s32 ROW5L, q3, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr r0, r0, r4
    vadd.s32 q2, q5, q6
    orrs r0, r0, r5                        /* sets Z iff rows 1-7 right half are all zero;
                                              flags stay live to the beq below (no
                                              intervening instruction writes them) */
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    orr r0, r4, r5                         /* r0 = row 0 right-half "any nonzero?" mask */
    vsub.s32 q3, q1, q4
    pop {r4, r5}
    vrshrn.s32 ROW7L, q2, #11
    vrshrn.s32 ROW3L, q5, #11
    vrshrn.s32 ROW0L, q6, #11
    vrshrn.s32 ROW4L, q3, #11

    beq 3f                                 /* Go to do some special handling for the sparse
                                              right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]               /* reload constants */
    vadd.s16 d10, ROW7R, ROW3R
    vadd.s16 d8, ROW5R, ROW1R
    /* Transpose left 4x8 half (interleaved with the right-half arithmetic) */
    vtrn.16 ROW6L, ROW7L
    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d8, XFIX_1_175875602
    vtrn.16 ROW2L, ROW3L
    vmull.s16 q7, d10, XFIX_1_175875602
    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16 ROW0L, ROW1L
    vsubl.s16 q3, ROW0R, ROW4R
    vmull.s16 q2, ROW2R, XFIX_0_541196100
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16 ROW4L, ROW5L
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32 ROW1L, ROW3L
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
    vtrn.32 ROW4L, ROW6L
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vtrn.32 ROW0L, ROW2L
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1R, q1, #11
    vtrn.32 ROW5L, ROW7L
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vrshrn.s32 ROW6R, q1, #11
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0R, ROW4R
    vrshrn.s32 ROW2R, q1, #11
    vrshrn.s32 ROW5R, q3, #11
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vrshrn.s32 ROW7R, q2, #11
    vrshrn.s32 ROW3R, q5, #11
    vrshrn.s32 ROW0R, q6, #11
    vrshrn.s32 ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16 ROW6R, ROW7R
    vtrn.16 ROW2R, ROW3R
    vtrn.16 ROW0R, ROW1R
    vtrn.16 ROW4R, ROW5R
    vtrn.32 ROW1R, ROW3R
    vtrn.32 ROW4R, ROW6R
    vtrn.32 ROW0R, ROW2R
    vtrn.32 ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half.
     * After the 4x8 transposes the logical rows are spread across the
     * register halves — the `ROWxL <-> ROWyR` comments note which logical
     * register each operand really is. */
    vld1.s16 {d2}, [ip, :64]               /* reload constants */
    vmull.s16 q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW0R             /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov q4, q6
    vmlsl.s16 q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16               /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW0R             /* ROW4L <-> ROW0R */
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16               /* ROW5L <-> ROW1R */
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16               /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16               /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]               /* reload constants */
    vmull.s16 q6, ROW5R, XFIX_1_175875602
    vmlal.s16 q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16 q7, ROW7R, XFIX_1_175875602
    vmlal.s16 q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16 q3, ROW4L, ROW4R             /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32 ROW5L, q1, #16               /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW4L, ROW4R             /* ROW4L <-> ROW0R */
    vshrn.s32 ROW6L, q1, #16               /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16               /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16               /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16 d16, q8, #2
    vqrshrn.s16 d17, q9, #2
    vqrshrn.s16 d18, q10, #2
    vqrshrn.s16 d19, q11, #2
    vpop {d8-d15}                          /* restore NEON registers */
    vqrshrn.s16 d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16 q8, q9
    vqrshrn.s16 d21, q13, #2
    vqrshrn.s16 d22, q14, #2
    vmov.u8 q0, #(CENTERJSAMPLE)
    vqrshrn.s16 d23, q15, #2
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vtrn.16 q10, q11
    /* Store results to the output buffer: each d-register is one 8-pixel
     * output row; row pointers come from OUTPUT_BUF, offset by OUTPUT_COL */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vadd.u8 q10, q10, q0
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vtrn.8 d22, d23
    vst1.8 {d20}, [TMP1]
    vadd.u8 q11, q11, q0
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros
     * (r0 still holds the "row 0 right half nonzero?" mask computed above) */

    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vtrn.16 ROW2L, ROW3L
    vtrn.16 ROW0L, ROW1L
    vtrn.16 ROW4L, ROW5L
    vshl.s16 ROW0R, ROW0R, #2              /* PASS1_BITS */
    vtrn.32 ROW1L, ROW3L
    vtrn.32 ROW4L, ROW6L
    vtrn.32 ROW0L, ROW2L
    vtrn.32 ROW5L, ROW7L

    cmp r0, #0
    beq 4f                                 /* Right 4x8 half has all zeros, go to 'sparse'
                                              second pass */

    /* Only row 0 is non-zero for the right 4x8 half: broadcast its (already
     * transposed) coefficients into all right-half rows, then reuse the
     * normal second pass */
    vdup.s16 ROW1R, ROW0R[1]
    vdup.s16 ROW2R, ROW0R[2]
    vdup.s16 ROW3R, ROW0R[3]
    vdup.s16 ROW4R, ROW0R[0]
    vdup.s16 ROW5R, ROW0R[1]
    vdup.s16 ROW6R, ROW0R[2]
    vdup.s16 ROW7R, ROW0R[3]
    vdup.s16 ROW0R, ROW0R[0]
    b 1b                                   /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]               /* reload constants */
    vmull.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vshll.s16 q3, ROW0L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6                    /* double q6 so one vsub replaces two */
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16               /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW0L, #13
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16               /* ROW5L <-> ROW1R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16               /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16               /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16 {d2}, [ip, :64]               /* reload constants */
    vmull.s16 q6, ROW5L, XFIX_1_175875602
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW7L, XFIX_1_175875602
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW6L, XFIX_0_541196100
    vshll.s16 q3, ROW4L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
    vshrn.s32 ROW5L, q1, #16               /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW4L, #13
    vshrn.s32 ROW6L, q1, #16               /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16               /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16               /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16
    b 2b                                   /* Go to epilogue */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

    .unreq ROW0L
    .unreq ROW0R
    .unreq ROW1L
    .unreq ROW1R
    .unreq ROW2L
    .unreq ROW2R
    .unreq ROW3L
    .unreq ROW3R
    .unreq ROW4L
    .unreq ROW4R
    .unreq ROW5L
    .unreq ROW5R
    .unreq ROW6L
    .unreq ROW6R
    .unreq ROW7L
    .unreq ROW7R
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000678
noel@chromium.org3395bcc2014-04-14 06:56:00 +0000679
hbono@chromium.org98626972011-08-03 03:13:08 +0000680/*****************************************************************************/
681
682/*
683 * jsimd_idct_ifast_neon
684 *
685 * This function contains a fast, not so accurate integer implementation of
686 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000687 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
hbono@chromium.org98626972011-08-03 03:13:08 +0000688 * function from jidctfst.c
689 *
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000690 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
691 * But in ARM NEON case some extra additions are required because VQDMULH
692 * instruction can't handle the constants larger than 1. So the expressions
693 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
694 * which introduces an extra addition. Overall, there are 6 extra additions
695 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
hbono@chromium.org98626972011-08-03 03:13:08 +0000696 */
697
/* Lane accessors for the four IFAST multipliers held in d0.  As explained in
 * the comment block above, VQDMULH can only multiply by constants < 1, so the
 * table stores the fractional part scaled to Q15: (value - 1.0), or
 * (value - 2.0) for 2.613125930; the integer part is restored with extra
 * vadd instructions in the code. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
hbono@chromium.org98626972011-08-03 03:13:08 +0000709
hbono@chromium.org98626972011-08-03 03:13:08 +0000710asm_function jsimd_idct_ifast_neon
711
712 DCT_TABLE .req r0
713 COEF_BLOCK .req r1
714 OUTPUT_BUF .req r2
715 OUTPUT_COL .req r3
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000716 TMP1 .req r0
717 TMP2 .req r1
718 TMP3 .req r2
719 TMP4 .req ip
hbono@chromium.org98626972011-08-03 03:13:08 +0000720
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000721 /* Load and dequantize coefficients into NEON registers
722 * with the following allocation:
hbono@chromium.org98626972011-08-03 03:13:08 +0000723 * 0 1 2 3 | 4 5 6 7
724 * ---------+--------
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000725 * 0 | d16 | d17 ( q8 )
726 * 1 | d18 | d19 ( q9 )
727 * 2 | d20 | d21 ( q10 )
728 * 3 | d22 | d23 ( q11 )
729 * 4 | d24 | d25 ( q12 )
730 * 5 | d26 | d27 ( q13 )
731 * 6 | d28 | d29 ( q14 )
732 * 7 | d30 | d31 ( q15 )
hbono@chromium.org98626972011-08-03 03:13:08 +0000733 */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000734 adr ip, jsimd_idct_ifast_neon_consts
735 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
736 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
737 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400738 vmul.s16 q8, q8, q0
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000739 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400740 vmul.s16 q9, q9, q1
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000741 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
742 vmul.s16 q10, q10, q2
743 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
744 vmul.s16 q11, q11, q3
745 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
746 vmul.s16 q12, q12, q0
747 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
748 vmul.s16 q14, q14, q2
749 vmul.s16 q13, q13, q1
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400750 vld1.16 {d0}, [ip, :64] /* load constants */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000751 vmul.s16 q15, q15, q3
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400752 vpush {d8-d13} /* save NEON registers */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000753 /* 1-D IDCT, pass 1 */
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400754 vsub.s16 q2, q10, q14
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000755 vadd.s16 q14, q10, q14
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400756 vsub.s16 q1, q11, q13
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000757 vadd.s16 q13, q11, q13
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400758 vsub.s16 q5, q9, q15
759 vadd.s16 q15, q9, q15
760 vqdmulh.s16 q4, q2, XFIX_1_414213562
761 vqdmulh.s16 q6, q1, XFIX_2_613125930
762 vadd.s16 q3, q1, q1
763 vsub.s16 q1, q5, q1
764 vadd.s16 q10, q2, q4
765 vqdmulh.s16 q4, q1, XFIX_1_847759065
766 vsub.s16 q2, q15, q13
767 vadd.s16 q3, q3, q6
768 vqdmulh.s16 q6, q2, XFIX_1_414213562
769 vadd.s16 q1, q1, q4
770 vqdmulh.s16 q4, q5, XFIX_1_082392200
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000771 vsub.s16 q10, q10, q14
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400772 vadd.s16 q2, q2, q6
773 vsub.s16 q6, q8, q12
774 vadd.s16 q12, q8, q12
775 vadd.s16 q9, q5, q4
776 vadd.s16 q5, q6, q10
777 vsub.s16 q10, q6, q10
778 vadd.s16 q6, q15, q13
779 vadd.s16 q8, q12, q14
780 vsub.s16 q3, q6, q3
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000781 vsub.s16 q12, q12, q14
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400782 vsub.s16 q3, q3, q1
783 vsub.s16 q1, q9, q1
784 vadd.s16 q2, q3, q2
785 vsub.s16 q15, q8, q6
786 vadd.s16 q1, q1, q2
787 vadd.s16 q8, q8, q6
788 vadd.s16 q14, q5, q3
789 vsub.s16 q9, q5, q3
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000790 vsub.s16 q13, q10, q2
791 vadd.s16 q10, q10, q2
792 /* Transpose */
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400793 vtrn.16 q8, q9
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000794 vsub.s16 q11, q12, q1
795 vtrn.16 q14, q15
796 vadd.s16 q12, q12, q1
797 vtrn.16 q10, q11
798 vtrn.16 q12, q13
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400799 vtrn.32 q9, q11
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000800 vtrn.32 q12, q14
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400801 vtrn.32 q8, q10
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000802 vtrn.32 q13, q15
803 vswp d28, d21
804 vswp d26, d19
805 /* 1-D IDCT, pass 2 */
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400806 vsub.s16 q2, q10, q14
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000807 vswp d30, d23
808 vadd.s16 q14, q10, q14
809 vswp d24, d17
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400810 vsub.s16 q1, q11, q13
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000811 vadd.s16 q13, q11, q13
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400812 vsub.s16 q5, q9, q15
813 vadd.s16 q15, q9, q15
814 vqdmulh.s16 q4, q2, XFIX_1_414213562
815 vqdmulh.s16 q6, q1, XFIX_2_613125930
816 vadd.s16 q3, q1, q1
817 vsub.s16 q1, q5, q1
818 vadd.s16 q10, q2, q4
819 vqdmulh.s16 q4, q1, XFIX_1_847759065
820 vsub.s16 q2, q15, q13
821 vadd.s16 q3, q3, q6
822 vqdmulh.s16 q6, q2, XFIX_1_414213562
823 vadd.s16 q1, q1, q4
824 vqdmulh.s16 q4, q5, XFIX_1_082392200
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000825 vsub.s16 q10, q10, q14
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400826 vadd.s16 q2, q2, q6
827 vsub.s16 q6, q8, q12
828 vadd.s16 q12, q8, q12
829 vadd.s16 q9, q5, q4
830 vadd.s16 q5, q6, q10
831 vsub.s16 q10, q6, q10
832 vadd.s16 q6, q15, q13
833 vadd.s16 q8, q12, q14
834 vsub.s16 q3, q6, q3
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000835 vsub.s16 q12, q12, q14
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400836 vsub.s16 q3, q3, q1
837 vsub.s16 q1, q9, q1
838 vadd.s16 q2, q3, q2
839 vsub.s16 q15, q8, q6
840 vadd.s16 q1, q1, q2
841 vadd.s16 q8, q8, q6
842 vadd.s16 q14, q5, q3
843 vsub.s16 q9, q5, q3
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000844 vsub.s16 q13, q10, q2
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400845 vpop {d8-d13} /* restore NEON registers */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000846 vadd.s16 q10, q10, q2
847 vsub.s16 q11, q12, q1
848 vadd.s16 q12, q12, q1
849 /* Descale to 8-bit and range limit */
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400850 vmov.u8 q0, #0x80
851 vqshrn.s16 d16, q8, #5
852 vqshrn.s16 d17, q9, #5
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000853 vqshrn.s16 d18, q10, #5
854 vqshrn.s16 d19, q11, #5
855 vqshrn.s16 d20, q12, #5
856 vqshrn.s16 d21, q13, #5
857 vqshrn.s16 d22, q14, #5
858 vqshrn.s16 d23, q15, #5
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400859 vadd.u8 q8, q8, q0
860 vadd.u8 q9, q9, q0
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000861 vadd.u8 q10, q10, q0
862 vadd.u8 q11, q11, q0
863 /* Transpose the final 8-bit samples */
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400864 vtrn.16 q8, q9
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000865 vtrn.16 q10, q11
Tom Hudson0d47d2d2016-05-04 13:22:56 -0400866 vtrn.32 q8, q10
867 vtrn.32 q9, q11
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000868 vtrn.8 d16, d17
869 vtrn.8 d18, d19
870 /* Store results to the output buffer */
871 ldmia OUTPUT_BUF!, {TMP1, TMP2}
872 add TMP1, TMP1, OUTPUT_COL
873 add TMP2, TMP2, OUTPUT_COL
874 vst1.8 {d16}, [TMP1]
875 vst1.8 {d17}, [TMP2]
876 ldmia OUTPUT_BUF!, {TMP1, TMP2}
877 add TMP1, TMP1, OUTPUT_COL
878 add TMP2, TMP2, OUTPUT_COL
879 vst1.8 {d18}, [TMP1]
880 vtrn.8 d20, d21
881 vst1.8 {d19}, [TMP2]
882 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
883 add TMP1, TMP1, OUTPUT_COL
884 add TMP2, TMP2, OUTPUT_COL
885 add TMP3, TMP3, OUTPUT_COL
886 add TMP4, TMP4, OUTPUT_COL
887 vst1.8 {d20}, [TMP1]
888 vtrn.8 d22, d23
889 vst1.8 {d21}, [TMP2]
890 vst1.8 {d22}, [TMP3]
891 vst1.8 {d23}, [TMP4]
hbono@chromium.org98626972011-08-03 03:13:08 +0000892 bx lr
893
894 .unreq DCT_TABLE
895 .unreq COEF_BLOCK
896 .unreq OUTPUT_BUF
897 .unreq OUTPUT_COL
hbono@chromium.orgc6beb742011-11-29 05:16:26 +0000898 .unreq TMP1
899 .unreq TMP2
900 .unreq TMP3
901 .unreq TMP4
hbono@chromium.org98626972011-08-03 03:13:08 +0000902
noel@chromium.org3395bcc2014-04-14 06:56:00 +0000903
hbono@chromium.org98626972011-08-03 03:13:08 +0000904/*****************************************************************************/
905
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 * the idct_helper/transpose_4x4 macros and reordering instructions,
 * but readability will suffer somewhat.
 */
923
/* Fixed-point scaling used by the reduced-size IDCT routines below:
 * FIX(x) = round(x * 2^CONST_BITS), i.e. Q13 fixed-point constants.
 */
#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
hbono@chromium.org98626972011-08-03 03:13:08 +0000940
/* Constant table for jsimd_idct_4x4_neon; loaded into d0-d2 as scalars for
 * the vmull/vmlal lane multiplies in idct_helper (d3 is padding only).
 * 16-byte aligned so it can be loaded with a :128-aligned vld1.
 */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */
hbono@chromium.org98626972011-08-03 03:13:08 +0000955
/* One 1-D pass of the reduced 4-point IDCT over four 4-element vectors.
 * Inputs \x4..\x16 are d-registers holding dequantized coefficient rows
 * (row 4 of the 8x8 block is not used by this algorithm); \y26..\y29
 * receive the four narrowed 16-bit result vectors.
 * Constants are taken from d0-d2 (see jsimd_idct_4x4_neon_consts).
 * Clobbers: q10, q12-q15.
 * \shift is the descale amount.  vrshrn's immediate can narrow by at most
 * 16 bits (the result element size), so for \shift > 16 we must shift in
 * 32 bits first (vrshr) and narrow separately (vmovn).
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16     q14, \x4, d2[2]
    vmlal.s16     q14, \x8, d0[0]
    vmlal.s16     q14, \x14, d0[1]

    vmull.s16     q13, \x16, d1[2]
    vmlal.s16     q13, \x12, d1[3]
    vmlal.s16     q13, \x10, d2[0]
    vmlal.s16     q13, \x6, d2[1]

    vmull.s16     q15, \x4, d2[2]
    vmlsl.s16     q15, \x8, d0[0]
    vmlsl.s16     q15, \x14, d0[1]

    vmull.s16     q12, \x16, d0[2]
    vmlal.s16     q12, \x12, d0[3]
    vmlal.s16     q12, \x10, d1[0]
    vmlal.s16     q12, \x6, d1[1]

    /* Butterfly: outputs 0/3 come from the sum/difference of q14 and q13 */
    vadd.s32      q10, q14, q13
    vsub.s32      q14, q14, q13

    .if \shift > 16
      /* Descale by a wide shift, then narrow (vrshrn immediate maxes at 16) */
      vrshr.s32   q10, q10, #\shift
      vrshr.s32   q14, q14, #\shift
      vmovn.s32   \y26, q10
      vmovn.s32   \y29, q14
    .else
      vrshrn.s32  \y26, q10, #\shift
      vrshrn.s32  \y29, q14, #\shift
    .endif

    /* Butterfly: outputs 1/2 come from the sum/difference of q15 and q12 */
    vadd.s32      q10, q15, q12
    vsub.s32      q15, q15, q12

    .if \shift > 16
      vrshr.s32   q10, q10, #\shift
      vrshr.s32   q15, q15, #\shift
      vmovn.s32   \y27, q10
      vmovn.s32   \y28, q15
    .else
      vrshrn.s32  \y27, q10, #\shift
      vrshrn.s32  \y28, q15, #\shift
    .endif
.endm
1001
/* void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col)
 * In (AAPCS): r0 = dct_table, r1 = coef_block, r2 = output_buf,
 *             r3 = output_col
 * Callee-saved d8-d15 are preserved via vpush/vpop.
 * TMP1-TMP4 reuse r0-r2/ip once the input pointers are no longer needed.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d4    |  d5
     *   1  |  d6    |  d7
     *   2  |  d8    |  d9
     *   3  |  d10   |  d11
     *   4  |   -    |   -      (row 4 is not used by the 4x4 algorithm)
     *   5  |  d12   |  d13
     *   6  |  d14   |  d15
     *   7  |  d16   |  d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16      /* skip row 4 */
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize: multiply each coefficient row by its quantization row */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16        /* skip row 4 */
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1: column IDCT on left and right 4-column halves, then transpose */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2: row IDCT with the final descale (shift 19 > 16 path) */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit: add the center-sample bias, saturate to unsigned 8-bit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer (4 rows of 4 pixels each) */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    /* Byte-at-a-time stores: safe for any alignment/endianness */
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper
1115
noel@chromium.org3395bcc2014-04-14 06:56:00 +00001116
hbono@chromium.org98626972011-08-03 03:13:08 +00001117/*****************************************************************************/
1118
/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit exact compatibility with jpeg-6b.
 */
1132
/* Constant table for jsimd_idct_2x2_neon; loaded into d0 as lane scalars
 * for the vmull/vmlal multiplies in the 2x2 idct_helper.
 */
.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */
hbono@chromium.org98626972011-08-03 03:13:08 +00001139
/* One 1-D pass of the reduced 2-point IDCT.
 * \x4 is the DC-ish term (widened by a plain shift left of 15); \x6..\x16
 * are the odd coefficient rows combined with the d0 lane constants.
 * Results (two 16-bit vectors) land in \y26 and \y27.
 * Clobbers: q10, q13, q14.
 * As in the 4x4 helper, \shift > 16 exceeds vrshrn's narrowing-immediate
 * range, so that path shifts wide first and narrows with vmovn.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16     q14, \x4, #15
    vmull.s16     q13, \x6, d0[3]
    vmlal.s16     q13, \x10, d0[2]
    vmlal.s16     q13, \x12, d0[1]
    vmlal.s16     q13, \x16, d0[0]

    /* Butterfly: the two outputs are the sum and difference */
    vadd.s32      q10, q14, q13
    vsub.s32      q14, q14, q13

    .if \shift > 16
      vrshr.s32   q10, q10, #\shift
      vrshr.s32   q14, q14, #\shift
      vmovn.s32   \y26, q10
      vmovn.s32   \y27, q14
    .else
      vrshrn.s32  \y26, q10, #\shift
      vrshrn.s32  \y27, q14, #\shift
    .endif
.endm
1160
/* void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col)
 * In (AAPCS): r0 = dct_table, r1 = coef_block, r2 = output_buf,
 *             r3 = output_col
 * Callee-saved d8-d15 are preserved via vpush/vpop.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d4    |  d5
     *   1  |  d6    |  d7
     *   2  |   -    |   -      (even rows 2/4/6 are not used)
     *   3  |  d10   |  d11
     *   4  |   -    |   -
     *   5  |  d12   |  d13
     *   6  |   -    |   -
     *   7  |  d16   |  d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16      /* skip row 2 */
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16      /* skip row 4 */
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16      /* skip row 6 */
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize: multiply each used row by its quantization row */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    /* Reference form of pass 1, kept for readability; the open-coded
     * sequence below interleaves both halves for better scheduling.
     */
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2: final descale (shift 20 > 16 path in idct_helper) */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: add the center-sample bias, saturate to unsigned 8-bit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer (2 rows of 2 pixels) */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper
1268
noel@chromium.org3395bcc2014-04-14 06:56:00 +00001269
hbono@chromium.org98626972011-08-03 03:13:08 +00001270/*****************************************************************************/
1271
/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
1282
1283
/* Load \size pixels of Y/Cb/Cr input into d0 (Y), d4 (U/Cb) and d5 (V/Cr).
 * Sizes 4/2/1 fill successive lanes (4: lanes 0-3, 2: lanes 4-5, 1: lane 6)
 * so that a trailing group of up to 7 pixels accumulates into one vector.
 * The post-increment writeback advances Y/U/V by \size bytes.
 */
.macro do_load size
    .if \size == 8
      vld1.8      {d4}, [U, :64]!
      vld1.8      {d5}, [V, :64]!
      vld1.8      {d0}, [Y, :64]!
      pld         [U, #64]          /* prefetch the next cache lines */
      pld         [V, #64]
      pld         [Y, #64]
    .elseif \size == 4
      vld1.8      {d4[0]}, [U]!
      vld1.8      {d4[1]}, [U]!
      vld1.8      {d4[2]}, [U]!
      vld1.8      {d4[3]}, [U]!
      vld1.8      {d5[0]}, [V]!
      vld1.8      {d5[1]}, [V]!
      vld1.8      {d5[2]}, [V]!
      vld1.8      {d5[3]}, [V]!
      vld1.8      {d0[0]}, [Y]!
      vld1.8      {d0[1]}, [Y]!
      vld1.8      {d0[2]}, [Y]!
      vld1.8      {d0[3]}, [Y]!
    .elseif \size == 2
      vld1.8      {d4[4]}, [U]!
      vld1.8      {d4[5]}, [U]!
      vld1.8      {d5[4]}, [V]!
      vld1.8      {d5[5]}, [V]!
      vld1.8      {d0[4]}, [Y]!
      vld1.8      {d0[5]}, [Y]!
    .elseif \size == 1
      vld1.8      {d4[6]}, [U]!
      vld1.8      {d5[6]}, [V]!
      vld1.8      {d0[6]}, [Y]!
    .else
      .error unsupported macroblock size
    .endif
.endm
1320
/* Store \size converted pixels to [RGB] with post-increment.
 * bpp 24: interleaved 3-byte stores from d10/d11/d12.
 * bpp 32: interleaved 4-byte stores from d10/d11/d12/d13 (alpha in the
 *         unused channel, preset to 0xFF by the caller).
 * bpp 16: packed RGB565 halfwords from q15.
 * Lane indices for sizes 4/2/1 mirror do_load's lane allocation.
 */
.macro do_store bpp, size
    .if \bpp == 24
      .if \size == 8
        vst3.8    {d10, d11, d12}, [RGB]!
      .elseif \size == 4
        vst3.8    {d10[0], d11[0], d12[0]}, [RGB]!
        vst3.8    {d10[1], d11[1], d12[1]}, [RGB]!
        vst3.8    {d10[2], d11[2], d12[2]}, [RGB]!
        vst3.8    {d10[3], d11[3], d12[3]}, [RGB]!
      .elseif \size == 2
        vst3.8    {d10[4], d11[4], d12[4]}, [RGB]!
        vst3.8    {d10[5], d11[5], d12[5]}, [RGB]!
      .elseif \size == 1
        vst3.8    {d10[6], d11[6], d12[6]}, [RGB]!
      .else
        .error unsupported macroblock size
      .endif
    .elseif \bpp == 32
      .if \size == 8
        vst4.8    {d10, d11, d12, d13}, [RGB]!
      .elseif \size == 4
        vst4.8    {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
        vst4.8    {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
        vst4.8    {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
        vst4.8    {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
      .elseif \size == 2
        vst4.8    {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
        vst4.8    {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
      .elseif \size == 1
        vst4.8    {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
      .else
        .error unsupported macroblock size
      .endif
    .elseif \bpp == 16
      .if \size == 8
        vst1.16   {q15}, [RGB]!
      .elseif \size == 4
        vst1.16   {d30}, [RGB]!
      .elseif \size == 2
        vst1.16   {d31[0]}, [RGB]!
        vst1.16   {d31[1]}, [RGB]!
      .elseif \size == 1
        vst1.16   {d31[2]}, [RGB]!
      .else
        .error unsupported macroblock size
      .endif
    .else
      .error unsupported bpp
    .endif
.endm
1371
/* Instantiate one YCbCr -> RGB conversion function for a given pixel
 * format.  \colorid names the function, \bpp selects the store path
 * (24/32/16), and \r_offs/\g_offs/\b_offs pick which of d10-d13 receives
 * each channel (ignored for \bpp == 16 / RGB565).
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen chroma and start the fixed-point multiplies.
 * q1 holds the -128 bias vector; d1 holds the conversion constants
 * (see jsimd_ycc_\colorid\()_neon_consts).
 */
.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.endm

/* Stage 2: descale the products, add luma (d0), and pack the result into
 * the channel registers (or the RGB565 word in q15 when \bpp == 16).
 */
.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q11, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    .if \bpp != 16
      vqmovun.s16   d1\g_offs, q11
      vqmovun.s16   d1\r_offs, q12
      vqmovun.s16   d1\b_offs, q14
    .else  /* rgb565 */
      vqshlu.s16    q13, q11, #8
      vqshlu.s16    q15, q12, #8
      vqshlu.s16    q14, q14, #8
      vsri.u16      q15, q13, #5
      vsri.u16      q15, q14, #11
    .endif
.endm

/* Software-pipelined steady-state iteration: finish stage 2 and store the
 * current 8 pixels while loading and starting stage 1 for the next 8.
 * Instruction order interleaves the two to hide load/multiply latency;
 * do not reorder without re-measuring.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1
    /* "do_yuv_to_rgb_stage2" and "store" */
    vrshrn.s32      d20, q10, #15
    /* "load" and "do_yuv_to_rgb_stage1" */
    pld             [U, #64]
    vrshrn.s32      d21, q11, #15
    pld             [V, #64]
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vaddw.u8        q11, q10, d0
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    .if \bpp != 16  /**************** rgb24/rgb32 *****************************/
      vqmovun.s16   d1\g_offs, q11
      pld           [Y, #64]
      vqmovun.s16   d1\r_offs, q12
      vld1.8        {d0}, [Y, :64]!
      vqmovun.s16   d1\b_offs, q14
      vmull.s16     q11, d7, d1[1] /* multiply by -11277 */
      vmlal.s16     q11, d9, d1[2] /* multiply by -23401 */
      do_store      \bpp, 8
      vmull.s16     q12, d8, d1[0] /* multiply by 22971 */
      vmull.s16     q13, d9, d1[0] /* multiply by 22971 */
      vmull.s16     q14, d6, d1[3] /* multiply by 29033 */
      vmull.s16     q15, d7, d1[3] /* multiply by 29033 */
    .else  /**************************** rgb565 *******************************/
      vqshlu.s16    q13, q11, #8
      pld           [Y, #64]
      vqshlu.s16    q15, q12, #8
      vqshlu.s16    q14, q14, #8
      vld1.8        {d0}, [Y, :64]!
      vmull.s16     q11, d7, d1[1]
      vmlal.s16     q11, d9, d1[2]
      vsri.u16      q15, q13, #5
      vmull.s16     q12, d8, d1[0]
      vsri.u16      q15, q14, #11
      vmull.s16     q13, d9, d1[0]
      vmull.s16     q14, d6, d1[3]
      do_store      \bpp, 8
      vmull.s16     q15, d7, d1[3]
    .endif
.endm

/* Unpipelined conversion of one (possibly partial) group of pixels */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0                      /* d0: padding */
    .short 22971, -11277, -23401, 29033    /* d1: conversion coefficients */
    .short -128, -128, -128, -128          /* d2: chroma bias (q1 low) */
    .short -128, -128, -128, -128          /* d3: chroma bias (q1 high) */

/* void jsimd_ycc_\colorid\()_convert_neon(JDIMENSION output_width,
 *         JSAMPIMAGE input_buf, JDIMENSION input_row,
 *         JSAMPARRAY output_buf, int num_rows)
 * In (AAPCS): r0 = output_width, r1 = input_buf, r2 = input_row,
 *             r3 = output_buf, [sp] = num_rows
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]  /* 5th arg, above the 8 pushes */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF (alpha for 32-bit formats) */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f                  /* fewer than 8 pixels: tail only */
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    blt             2f                  /* exactly one full group */
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7
    beq             8f
3:
    /* Tail: gather up to 7 leftover pixels lane by lane, convert once.
     * Note the duplicate numeric label 3 below is intentional: "beq 3f"
     * branches to the NEXT definition (GNU as local-label semantics).
     */
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm
1596
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0

.purgem do_load
.purgem do_store
1608
noel@chromium.org3395bcc2014-04-14 06:56:00 +00001609
hbono@chromium.org98626972011-08-03 03:13:08 +00001610/*****************************************************************************/
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001611
/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
1622
/* Store \size converted samples of Y (d20), Cb (d21) and Cr (d22) to their
 * respective output pointers with post-increment.  Sizes 4/2/1 store from
 * successive lanes (4: lanes 0-3, 2: lanes 4-5, 1: lane 6), mirroring the
 * lane allocation used when loading partial groups.
 */
.macro do_store size
    .if \size == 8
      vst1.8      {d20}, [Y]!
      vst1.8      {d21}, [U]!
      vst1.8      {d22}, [V]!
    .elseif \size == 4
      vst1.8      {d20[0]}, [Y]!
      vst1.8      {d20[1]}, [Y]!
      vst1.8      {d20[2]}, [Y]!
      vst1.8      {d20[3]}, [Y]!
      vst1.8      {d21[0]}, [U]!
      vst1.8      {d21[1]}, [U]!
      vst1.8      {d21[2]}, [U]!
      vst1.8      {d21[3]}, [U]!
      vst1.8      {d22[0]}, [V]!
      vst1.8      {d22[1]}, [V]!
      vst1.8      {d22[2]}, [V]!
      vst1.8      {d22[3]}, [V]!
    .elseif \size == 2
      vst1.8      {d20[4]}, [Y]!
      vst1.8      {d20[5]}, [Y]!
      vst1.8      {d21[4]}, [U]!
      vst1.8      {d21[5]}, [U]!
      vst1.8      {d22[4]}, [V]!
      vst1.8      {d22[5]}, [V]!
    .elseif \size == 1
      vst1.8      {d20[6]}, [Y]!
      vst1.8      {d21[6]}, [U]!
      vst1.8      {d22[6]}, [V]!
    .else
      .error unsupported macroblock size
    .endif
.endm
1656
/* Load \size RGB pixels from [RGB] with post-increment into d10-d13.
 * bpp 24: deinterleave 3 channels into d10/d11/d12 (vld3).
 * bpp 32: deinterleave 4 channels into d10/d11/d12/d13 (vld4).
 * Lane indices for sizes 4/2/1 mirror do_store's lane allocation.
 */
.macro do_load bpp, size
    .if \bpp == 24
      .if \size == 8
        vld3.8    {d10, d11, d12}, [RGB]!
        pld       [RGB, #128]        /* prefetch upcoming input */
      .elseif \size == 4
        vld3.8    {d10[0], d11[0], d12[0]}, [RGB]!
        vld3.8    {d10[1], d11[1], d12[1]}, [RGB]!
        vld3.8    {d10[2], d11[2], d12[2]}, [RGB]!
        vld3.8    {d10[3], d11[3], d12[3]}, [RGB]!
      .elseif \size == 2
        vld3.8    {d10[4], d11[4], d12[4]}, [RGB]!
        vld3.8    {d10[5], d11[5], d12[5]}, [RGB]!
      .elseif \size == 1
        vld3.8    {d10[6], d11[6], d12[6]}, [RGB]!
      .else
        .error unsupported macroblock size
      .endif
    .elseif \bpp == 32
      .if \size == 8
        vld4.8    {d10, d11, d12, d13}, [RGB]!
        pld       [RGB, #128]
      .elseif \size == 4
        vld4.8    {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
        vld4.8    {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
        vld4.8    {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
        vld4.8    {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
      .elseif \size == 2
        vld4.8    {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
        vld4.8    {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
      .elseif \size == 1
        vld4.8    {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
      .else
        .error unsupported macroblock size
      .endif
    .else
      .error unsupported bpp
    .endif
.endm
1696
1697.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1698
1699/*
Tom Hudson0d47d2d2016-05-04 13:22:56 -04001700 * 2-stage pipelined RGB->YCbCr conversion
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001701 */
1702
1703.macro do_rgb_to_yuv_stage1
Tom Hudson0d47d2d2016-05-04 13:22:56 -04001704 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1705 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1706 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1707 vmull.u16 q7, d4, d0[0]
1708 vmlal.u16 q7, d6, d0[1]
1709 vmlal.u16 q7, d8, d0[2]
1710 vmull.u16 q8, d5, d0[0]
1711 vmlal.u16 q8, d7, d0[1]
1712 vmlal.u16 q8, d9, d0[2]
1713 vrev64.32 q9, q1
1714 vrev64.32 q13, q1
1715 vmlsl.u16 q9, d4, d0[3]
1716 vmlsl.u16 q9, d6, d1[0]
1717 vmlal.u16 q9, d8, d1[1]
1718 vmlsl.u16 q13, d5, d0[3]
1719 vmlsl.u16 q13, d7, d1[0]
1720 vmlal.u16 q13, d9, d1[1]
1721 vrev64.32 q14, q1
1722 vrev64.32 q15, q1
1723 vmlal.u16 q14, d4, d1[1]
1724 vmlsl.u16 q14, d6, d1[2]
1725 vmlsl.u16 q14, d8, d1[3]
1726 vmlal.u16 q15, d5, d1[1]
1727 vmlsl.u16 q15, d7, d1[2]
1728 vmlsl.u16 q15, d9, d1[3]
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001729.endm
1730
1731.macro do_rgb_to_yuv_stage2
Tom Hudson0d47d2d2016-05-04 13:22:56 -04001732 vrshrn.u32 d20, q7, #16
1733 vrshrn.u32 d21, q8, #16
1734 vshrn.u32 d22, q9, #16
1735 vshrn.u32 d23, q13, #16
1736 vshrn.u32 d24, q14, #16
1737 vshrn.u32 d25, q15, #16
1738 vmovn.u16 d20, q10 /* d20 = y */
1739 vmovn.u16 d21, q11 /* d21 = u */
1740 vmovn.u16 d22, q12 /* d22 = v */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001741.endm
1742
1743.macro do_rgb_to_yuv
1744 do_rgb_to_yuv_stage1
1745 do_rgb_to_yuv_stage2
1746.endm
1747
1748.macro do_rgb_to_yuv_stage2_store_load_stage1
Tom Hudson0d47d2d2016-05-04 13:22:56 -04001749 vrshrn.u32 d20, q7, #16
1750 vrshrn.u32 d21, q8, #16
1751 vshrn.u32 d22, q9, #16
1752 vrev64.32 q9, q1
1753 vshrn.u32 d23, q13, #16
1754 vrev64.32 q13, q1
1755 vshrn.u32 d24, q14, #16
1756 vshrn.u32 d25, q15, #16
1757 do_load \bpp, 8
1758 vmovn.u16 d20, q10 /* d20 = y */
1759 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1760 vmovn.u16 d21, q11 /* d21 = u */
1761 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1762 vmovn.u16 d22, q12 /* d22 = v */
1763 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1764 vmull.u16 q7, d4, d0[0]
1765 vmlal.u16 q7, d6, d0[1]
1766 vmlal.u16 q7, d8, d0[2]
1767 vst1.8 {d20}, [Y]!
1768 vmull.u16 q8, d5, d0[0]
1769 vmlal.u16 q8, d7, d0[1]
1770 vmlal.u16 q8, d9, d0[2]
1771 vmlsl.u16 q9, d4, d0[3]
1772 vmlsl.u16 q9, d6, d1[0]
1773 vmlal.u16 q9, d8, d1[1]
1774 vst1.8 {d21}, [U]!
1775 vmlsl.u16 q13, d5, d0[3]
1776 vmlsl.u16 q13, d7, d1[0]
1777 vmlal.u16 q13, d9, d1[1]
1778 vrev64.32 q14, q1
1779 vrev64.32 q15, q1
1780 vmlal.u16 q14, d4, d1[1]
1781 vmlsl.u16 q14, d6, d1[2]
1782 vmlsl.u16 q14, d8, d1[3]
1783 vst1.8 {d22}, [V]!
1784 vmlal.u16 q15, d5, d1[1]
1785 vmlsl.u16 q15, d7, d1[2]
1786 vmlsl.u16 q15, d9, d1[3]
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001787.endm
1788
1789.balign 16
1790jsimd_\colorid\()_ycc_neon_consts:
Tom Hudson0d47d2d2016-05-04 13:22:56 -04001791 .short 19595, 38470, 7471, 11059
1792 .short 21709, 32768, 27439, 5329
1793 .short 32767, 128, 32767, 128
1794 .short 32767, 128, 32767, 128
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001795
1796asm_function jsimd_\colorid\()_ycc_convert_neon
1797 OUTPUT_WIDTH .req r0
1798 INPUT_BUF .req r1
1799 OUTPUT_BUF .req r2
1800 OUTPUT_ROW .req r3
1801 NUM_ROWS .req r4
1802
1803 OUTPUT_BUF0 .req r5
1804 OUTPUT_BUF1 .req r6
1805 OUTPUT_BUF2 .req OUTPUT_BUF
1806
1807 RGB .req r7
1808 Y .req r8
1809 U .req r9
1810 V .req r10
1811 N .req ip
1812
1813 /* Load constants to d0, d1, d2, d3 */
1814 adr ip, jsimd_\colorid\()_ycc_neon_consts
1815 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1816
1817 /* Save ARM registers and handle input arguments */
1818 push {r4, r5, r6, r7, r8, r9, r10, lr}
1819 ldr NUM_ROWS, [sp, #(4 * 8)]
1820 ldr OUTPUT_BUF0, [OUTPUT_BUF]
1821 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
1822 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
1823 .unreq OUTPUT_BUF
1824
1825 /* Save NEON registers */
1826 vpush {d8-d15}
1827
1828 /* Outer loop over scanlines */
1829 cmp NUM_ROWS, #1
1830 blt 9f
18310:
1832 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1833 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1834 mov N, OUTPUT_WIDTH
1835 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1836 add OUTPUT_ROW, OUTPUT_ROW, #1
1837 ldr RGB, [INPUT_BUF], #4
1838
1839 /* Inner loop over pixels */
1840 subs N, N, #8
1841 blt 3f
1842 do_load \bpp, 8
1843 do_rgb_to_yuv_stage1
1844 subs N, N, #8
1845 blt 2f
18461:
1847 do_rgb_to_yuv_stage2_store_load_stage1
1848 subs N, N, #8
1849 bge 1b
18502:
1851 do_rgb_to_yuv_stage2
1852 do_store 8
1853 tst N, #7
1854 beq 8f
18553:
1856 tst N, #4
1857 beq 3f
1858 do_load \bpp, 4
18593:
1860 tst N, #2
1861 beq 4f
1862 do_load \bpp, 2
18634:
1864 tst N, #1
1865 beq 5f
1866 do_load \bpp, 1
18675:
1868 do_rgb_to_yuv
1869 tst N, #4
1870 beq 6f
1871 do_store 4
18726:
1873 tst N, #2
1874 beq 7f
1875 do_store 2
18767:
1877 tst N, #1
1878 beq 8f
1879 do_store 1
18808:
1881 subs NUM_ROWS, NUM_ROWS, #1
1882 bgt 0b
18839:
1884 /* Restore all registers and return */
1885 vpop {d8-d15}
1886 pop {r4, r5, r6, r7, r8, r9, r10, pc}
1887
1888 .unreq OUTPUT_WIDTH
1889 .unreq OUTPUT_ROW
1890 .unreq INPUT_BUF
1891 .unreq NUM_ROWS
1892 .unreq OUTPUT_BUF0
1893 .unreq OUTPUT_BUF1
1894 .unreq OUTPUT_BUF2
1895 .unreq RGB
1896 .unreq Y
1897 .unreq U
1898 .unreq V
1899 .unreq N
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001900
1901.purgem do_rgb_to_yuv
1902.purgem do_rgb_to_yuv_stage1
1903.purgem do_rgb_to_yuv_stage2
1904.purgem do_rgb_to_yuv_stage2_store_load_stage1
1905
1906.endm
1907
1908/*--------------------------------- id ----- bpp R G B */
1909generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
1910generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
1911generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1912generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1913generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1914generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1915
1916.purgem do_load
1917.purgem do_store
1918
noel@chromium.org3395bcc2014-04-14 06:56:00 +00001919
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001920/*****************************************************************************/
1921
1922/*
1923 * Load data into workspace, applying unsigned->signed conversion
1924 *
1925 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1926 * rid of VST1.16 instructions
1927 */
1928
1929asm_function jsimd_convsamp_neon
1930 SAMPLE_DATA .req r0
1931 START_COL .req r1
1932 WORKSPACE .req r2
1933 TMP1 .req r3
1934 TMP2 .req r4
1935 TMP3 .req r5
1936 TMP4 .req ip
1937
1938 push {r4, r5}
1939 vmov.u8 d0, #128
1940
1941 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1942 add TMP1, TMP1, START_COL
1943 add TMP2, TMP2, START_COL
1944 add TMP3, TMP3, START_COL
1945 add TMP4, TMP4, START_COL
1946 vld1.8 {d16}, [TMP1]
1947 vsubl.u8 q8, d16, d0
1948 vld1.8 {d18}, [TMP2]
1949 vsubl.u8 q9, d18, d0
1950 vld1.8 {d20}, [TMP3]
1951 vsubl.u8 q10, d20, d0
1952 vld1.8 {d22}, [TMP4]
1953 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1954 vsubl.u8 q11, d22, d0
1955 vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
1956 add TMP1, TMP1, START_COL
1957 add TMP2, TMP2, START_COL
1958 vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
1959 add TMP3, TMP3, START_COL
1960 add TMP4, TMP4, START_COL
1961 vld1.8 {d24}, [TMP1]
1962 vsubl.u8 q12, d24, d0
1963 vld1.8 {d26}, [TMP2]
1964 vsubl.u8 q13, d26, d0
1965 vld1.8 {d28}, [TMP3]
1966 vsubl.u8 q14, d28, d0
1967 vld1.8 {d30}, [TMP4]
1968 vsubl.u8 q15, d30, d0
1969 vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
1970 vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
1971 pop {r4, r5}
1972 bx lr
1973
1974 .unreq SAMPLE_DATA
1975 .unreq START_COL
1976 .unreq WORKSPACE
1977 .unreq TMP1
1978 .unreq TMP2
1979 .unreq TMP3
1980 .unreq TMP4
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001981
noel@chromium.org3395bcc2014-04-14 06:56:00 +00001982
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00001983/*****************************************************************************/
1984
1985/*
1986 * jsimd_fdct_ifast_neon
1987 *
1988 * This function contains a fast, not so accurate integer implementation of
1989 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1990 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1991 * function from jfdctfst.c
1992 *
1993 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1994 * rid of a bunch of VLD1.16 instructions
1995 */
1996
1997#define XFIX_0_382683433 d0[0]
1998#define XFIX_0_541196100 d0[1]
1999#define XFIX_0_707106781 d0[2]
2000#define XFIX_1_306562965 d0[3]
2001
2002.balign 16
2003jsimd_fdct_ifast_neon_consts:
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002004 .short (98 * 128) /* XFIX_0_382683433 */
2005 .short (139 * 128) /* XFIX_0_541196100 */
2006 .short (181 * 128) /* XFIX_0_707106781 */
2007 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002008
2009asm_function jsimd_fdct_ifast_neon
2010
2011 DATA .req r0
2012 TMP .req ip
2013
2014 vpush {d8-d15}
2015
2016 /* Load constants */
2017 adr TMP, jsimd_fdct_ifast_neon_consts
2018 vld1.16 {d0}, [TMP, :64]
2019
2020 /* Load all DATA into NEON registers with the following allocation:
2021 * 0 1 2 3 | 4 5 6 7
2022 * ---------+--------
2023 * 0 | d16 | d17 | q8
2024 * 1 | d18 | d19 | q9
2025 * 2 | d20 | d21 | q10
2026 * 3 | d22 | d23 | q11
2027 * 4 | d24 | d25 | q12
2028 * 5 | d26 | d27 | q13
2029 * 6 | d28 | d29 | q14
2030 * 7 | d30 | d31 | q15
2031 */
2032
2033 vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
2034 vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
2035 vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
2036 vld1.16 {d28, d29, d30, d31}, [DATA, :128]
2037 sub DATA, DATA, #(128 - 32)
2038
2039 mov TMP, #2
20401:
2041 /* Transpose */
2042 vtrn.16 q12, q13
2043 vtrn.16 q10, q11
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002044 vtrn.16 q8, q9
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002045 vtrn.16 q14, q15
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002046 vtrn.32 q9, q11
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002047 vtrn.32 q13, q15
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002048 vtrn.32 q8, q10
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002049 vtrn.32 q12, q14
2050 vswp d30, d23
2051 vswp d24, d17
2052 vswp d26, d19
2053 /* 1-D FDCT */
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002054 vadd.s16 q2, q11, q12
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002055 vswp d28, d21
2056 vsub.s16 q12, q11, q12
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002057 vsub.s16 q6, q10, q13
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002058 vadd.s16 q10, q10, q13
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002059 vsub.s16 q7, q9, q14
2060 vadd.s16 q9, q9, q14
2061 vsub.s16 q1, q8, q15
2062 vadd.s16 q8, q8, q15
2063 vsub.s16 q4, q9, q10
2064 vsub.s16 q5, q8, q2
2065 vadd.s16 q3, q9, q10
2066 vadd.s16 q4, q4, q5
2067 vadd.s16 q2, q8, q2
2068 vqdmulh.s16 q4, q4, XFIX_0_707106781
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002069 vadd.s16 q11, q12, q6
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002070 vadd.s16 q8, q2, q3
2071 vsub.s16 q12, q2, q3
2072 vadd.s16 q3, q6, q7
2073 vadd.s16 q7, q7, q1
2074 vqdmulh.s16 q3, q3, XFIX_0_707106781
2075 vsub.s16 q6, q11, q7
2076 vadd.s16 q10, q5, q4
2077 vqdmulh.s16 q6, q6, XFIX_0_382683433
2078 vsub.s16 q14, q5, q4
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002079 vqdmulh.s16 q11, q11, XFIX_0_541196100
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002080 vqdmulh.s16 q5, q7, XFIX_1_306562965
2081 vadd.s16 q4, q1, q3
2082 vsub.s16 q3, q1, q3
2083 vadd.s16 q7, q7, q6
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002084 vadd.s16 q11, q11, q6
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002085 vadd.s16 q7, q7, q5
2086 vadd.s16 q13, q3, q11
2087 vsub.s16 q11, q3, q11
2088 vadd.s16 q9, q4, q7
2089 vsub.s16 q15, q4, q7
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002090 subs TMP, TMP, #1
2091 bne 1b
2092
2093 /* store results */
2094 vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
2095 vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
2096 vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
2097 vst1.16 {d28, d29, d30, d31}, [DATA, :128]
2098
2099 vpop {d8-d15}
2100 bx lr
2101
2102 .unreq DATA
2103 .unreq TMP
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002104
noel@chromium.org3395bcc2014-04-14 06:56:00 +00002105
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002106/*****************************************************************************/
2107
2108/*
2109 * GLOBAL(void)
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002110 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2111 * DCTELEM *workspace);
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002112 *
2113 * Note: the code uses 2 stage pipelining in order to improve instructions
2114 * scheduling and eliminate stalls (this provides ~15% better
2115 * performance for this function on both ARM Cortex-A8 and
2116 * ARM Cortex-A9 when compared to the non-pipelined variant).
2117 * The instructions which belong to the second stage use different
 * indentation for better readability.
2119 */
2120asm_function jsimd_quantize_neon
2121
2122 COEF_BLOCK .req r0
2123 DIVISORS .req r1
2124 WORKSPACE .req r2
2125
2126 RECIPROCAL .req DIVISORS
2127 CORRECTION .req r3
2128 SHIFT .req ip
2129 LOOP_COUNT .req r4
2130
2131 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2132 vabs.s16 q12, q0
2133 add CORRECTION, DIVISORS, #(64 * 2)
2134 add SHIFT, DIVISORS, #(64 * 6)
2135 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2136 vabs.s16 q13, q1
2137 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002138 vadd.u16 q12, q12, q10 /* add correction */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002139 vadd.u16 q13, q13, q11
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002140 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002141 vmull.u16 q11, d25, d17
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002142 vmull.u16 q8, d26, d18
2143 vmull.u16 q9, d27, d19
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002144 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2145 vshrn.u32 d20, q10, #16
2146 vshrn.u32 d21, q11, #16
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002147 vshrn.u32 d22, q8, #16
2148 vshrn.u32 d23, q9, #16
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002149 vneg.s16 q12, q12
2150 vneg.s16 q13, q13
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002151 vshr.s16 q2, q0, #15 /* extract sign */
2152 vshr.s16 q3, q1, #15
2153 vshl.u16 q14, q10, q12 /* shift */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002154 vshl.u16 q15, q11, q13
2155
2156 push {r4, r5}
2157 mov LOOP_COUNT, #3
21581:
2159 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2160 veor.u16 q14, q14, q2 /* restore sign */
2161 vabs.s16 q12, q0
2162 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2163 vabs.s16 q13, q1
2164 veor.u16 q15, q15, q3
2165 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002166 vadd.u16 q12, q12, q10 /* add correction */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002167 vadd.u16 q13, q13, q11
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002168 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002169 vmull.u16 q11, d25, d17
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002170 vmull.u16 q8, d26, d18
2171 vmull.u16 q9, d27, d19
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002172 vsub.u16 q14, q14, q2
2173 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2174 vsub.u16 q15, q15, q3
2175 vshrn.u32 d20, q10, #16
2176 vshrn.u32 d21, q11, #16
2177 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002178 vshrn.u32 d22, q8, #16
2179 vshrn.u32 d23, q9, #16
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002180 vneg.s16 q12, q12
2181 vneg.s16 q13, q13
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002182 vshr.s16 q2, q0, #15 /* extract sign */
2183 vshr.s16 q3, q1, #15
2184 vshl.u16 q14, q10, q12 /* shift */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002185 vshl.u16 q15, q11, q13
2186 subs LOOP_COUNT, LOOP_COUNT, #1
2187 bne 1b
2188 pop {r4, r5}
2189
2190 veor.u16 q14, q14, q2 /* restore sign */
2191 veor.u16 q15, q15, q3
2192 vsub.u16 q14, q14, q2
2193 vsub.u16 q15, q15, q3
2194 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2195
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002196 bx lr /* return */
hbono@chromium.orgc6beb742011-11-29 05:16:26 +00002197
2198 .unreq COEF_BLOCK
2199 .unreq DIVISORS
2200 .unreq WORKSPACE
2201 .unreq RECIPROCAL
2202 .unreq CORRECTION
2203 .unreq SHIFT
2204 .unreq LOOP_COUNT
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002205
noel@chromium.org3395bcc2014-04-14 06:56:00 +00002206
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002207/*****************************************************************************/
2208
2209/*
2210 * GLOBAL(void)
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002211 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
2212 * JDIMENSION downsampled_width,
2213 * JSAMPARRAY input_data,
2214 * JSAMPARRAY *output_data_ptr);
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002215 *
2216 * Note: the use of unaligned writes is the main remaining bottleneck in
2217 * this code, which can be potentially solved to get up to tens
2218 * of percents performance improvement on Cortex-A8/Cortex-A9.
2219 */
2220
2221/*
2222 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2223 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2224 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2225 * Register d28 is used for multiplication by 3. Register q15 is used
2226 * for adding +1 bias.
2227 */
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002228.macro upsample16 OUTPTR, INPTR
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002229 vld1.8 {q0}, [\INPTR]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002230 vmovl.u8 q8, d0
2231 vext.8 q2, q1, q0, #15
2232 vmovl.u8 q9, d1
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002233 vaddw.u8 q10, q15, d4
2234 vaddw.u8 q11, q15, d5
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002235 vmlal.u8 q8, d4, d28
2236 vmlal.u8 q9, d5, d28
2237 vmlal.u8 q10, d0, d28
2238 vmlal.u8 q11, d1, d28
2239 vmov q1, q0 /* backup source pixels to q1 */
2240 vrshrn.u16 d6, q8, #2
2241 vrshrn.u16 d7, q9, #2
2242 vshrn.u16 d8, q10, #2
2243 vshrn.u16 d9, q11, #2
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002244 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
2245.endm
2246
2247/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the 'upsample16'
2249 * macro, the roles of q0 and q1 registers are reversed for even and odd
2250 * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
2251 * Also this unrolling allows to reorder loads and stores to compensate
2252 * multiplication latency and reduce stalls.
2253 */
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002254.macro upsample32 OUTPTR, INPTR
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002255 /* even 16 pixels group */
2256 vld1.8 {q0}, [\INPTR]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002257 vmovl.u8 q8, d0
2258 vext.8 q2, q1, q0, #15
2259 vmovl.u8 q9, d1
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002260 vaddw.u8 q10, q15, d4
2261 vaddw.u8 q11, q15, d5
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002262 vmlal.u8 q8, d4, d28
2263 vmlal.u8 q9, d5, d28
2264 vmlal.u8 q10, d0, d28
2265 vmlal.u8 q11, d1, d28
2266 /* odd 16 pixels group */
2267 vld1.8 {q1}, [\INPTR]!
2268 vrshrn.u16 d6, q8, #2
2269 vrshrn.u16 d7, q9, #2
2270 vshrn.u16 d8, q10, #2
2271 vshrn.u16 d9, q11, #2
2272 vmovl.u8 q8, d2
2273 vext.8 q2, q0, q1, #15
2274 vmovl.u8 q9, d3
2275 vaddw.u8 q10, q15, d4
2276 vaddw.u8 q11, q15, d5
2277 vmlal.u8 q8, d4, d28
2278 vmlal.u8 q9, d5, d28
2279 vmlal.u8 q10, d2, d28
2280 vmlal.u8 q11, d3, d28
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002281 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002282 vrshrn.u16 d6, q8, #2
2283 vrshrn.u16 d7, q9, #2
2284 vshrn.u16 d8, q10, #2
2285 vshrn.u16 d9, q11, #2
2286 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002287.endm
2288
2289/*
2290 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2291 */
2292.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2293 /* special case for the first and last pixels */
2294 sub \WIDTH, \WIDTH, #1
2295 add \OUTPTR, \OUTPTR, #1
2296 ldrb \TMP1, [\INPTR, \WIDTH]
2297 strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
2298 ldrb \TMP1, [\INPTR], #1
2299 strb \TMP1, [\OUTPTR, #-1]
2300 vmov.8 d3[7], \TMP1
2301
2302 subs \WIDTH, \WIDTH, #32
2303 blt 5f
23040: /* process 32 pixels per iteration */
2305 upsample32 \OUTPTR, \INPTR
2306 subs \WIDTH, \WIDTH, #32
2307 bge 0b
23085:
2309 adds \WIDTH, \WIDTH, #16
2310 blt 1f
23110: /* process 16 pixels if needed */
2312 upsample16 \OUTPTR, \INPTR
2313 subs \WIDTH, \WIDTH, #16
23141:
2315 adds \WIDTH, \WIDTH, #16
2316 beq 9f
2317
2318 /* load the remaining 1-15 pixels */
2319 add \INPTR, \INPTR, \WIDTH
2320 tst \WIDTH, #1
2321 beq 2f
2322 sub \INPTR, \INPTR, #1
2323 vld1.8 {d0[0]}, [\INPTR]
23242:
2325 tst \WIDTH, #2
2326 beq 2f
2327 vext.8 d0, d0, d0, #6
2328 sub \INPTR, \INPTR, #1
2329 vld1.8 {d0[1]}, [\INPTR]
2330 sub \INPTR, \INPTR, #1
2331 vld1.8 {d0[0]}, [\INPTR]
23322:
2333 tst \WIDTH, #4
2334 beq 2f
2335 vrev64.32 d0, d0
2336 sub \INPTR, \INPTR, #1
2337 vld1.8 {d0[3]}, [\INPTR]
2338 sub \INPTR, \INPTR, #1
2339 vld1.8 {d0[2]}, [\INPTR]
2340 sub \INPTR, \INPTR, #1
2341 vld1.8 {d0[1]}, [\INPTR]
2342 sub \INPTR, \INPTR, #1
2343 vld1.8 {d0[0]}, [\INPTR]
23442:
2345 tst \WIDTH, #8
2346 beq 2f
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002347 vmov d1, d0
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002348 sub \INPTR, \INPTR, #8
2349 vld1.8 {d0}, [\INPTR]
23502: /* upsample the remaining pixels */
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002351 vmovl.u8 q8, d0
2352 vext.8 q2, q1, q0, #15
2353 vmovl.u8 q9, d1
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002354 vaddw.u8 q10, q15, d4
2355 vaddw.u8 q11, q15, d5
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002356 vmlal.u8 q8, d4, d28
2357 vmlal.u8 q9, d5, d28
2358 vmlal.u8 q10, d0, d28
2359 vmlal.u8 q11, d1, d28
2360 vrshrn.u16 d10, q8, #2
2361 vrshrn.u16 d12, q9, #2
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002362 vshrn.u16 d11, q10, #2
2363 vshrn.u16 d13, q11, #2
2364 vzip.8 d10, d11
2365 vzip.8 d12, d13
2366 /* store the remaining pixels */
2367 tst \WIDTH, #8
2368 beq 2f
2369 vst1.8 {d10, d11}, [\OUTPTR]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002370 vmov q5, q6
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +000023712:
2372 tst \WIDTH, #4
2373 beq 2f
2374 vst1.8 {d10}, [\OUTPTR]!
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002375 vmov d10, d11
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +000023762:
2377 tst \WIDTH, #2
2378 beq 2f
2379 vst1.8 {d10[0]}, [\OUTPTR]!
2380 vst1.8 {d10[1]}, [\OUTPTR]!
2381 vst1.8 {d10[2]}, [\OUTPTR]!
2382 vst1.8 {d10[3]}, [\OUTPTR]!
2383 vext.8 d10, d10, d10, #4
23842:
2385 tst \WIDTH, #1
2386 beq 2f
2387 vst1.8 {d10[0]}, [\OUTPTR]!
2388 vst1.8 {d10[1]}, [\OUTPTR]!
23892:
23909:
2391.endm
2392
2393asm_function jsimd_h2v1_fancy_upsample_neon
2394
2395 MAX_V_SAMP_FACTOR .req r0
2396 DOWNSAMPLED_WIDTH .req r1
2397 INPUT_DATA .req r2
2398 OUTPUT_DATA_PTR .req r3
2399 OUTPUT_DATA .req OUTPUT_DATA_PTR
2400
2401 OUTPTR .req r4
2402 INPTR .req r5
2403 WIDTH .req ip
2404 TMP .req lr
2405
2406 push {r4, r5, r6, lr}
2407 vpush {d8-d15}
2408
2409 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
2410 cmp MAX_V_SAMP_FACTOR, #0
2411 ble 99f
2412
2413 /* initialize constants */
2414 vmov.u8 d28, #3
2415 vmov.u16 q15, #1
241611:
2417 ldr INPTR, [INPUT_DATA], #4
2418 ldr OUTPTR, [OUTPUT_DATA], #4
2419 mov WIDTH, DOWNSAMPLED_WIDTH
2420 upsample_row OUTPTR, INPTR, WIDTH, TMP
2421 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2422 bgt 11b
2423
242499:
2425 vpop {d8-d15}
2426 pop {r4, r5, r6, pc}
2427
2428 .unreq MAX_V_SAMP_FACTOR
2429 .unreq DOWNSAMPLED_WIDTH
2430 .unreq INPUT_DATA
2431 .unreq OUTPUT_DATA_PTR
2432 .unreq OUTPUT_DATA
2433
2434 .unreq OUTPTR
2435 .unreq INPTR
2436 .unreq WIDTH
2437 .unreq TMP
2438
hbono@chromium.orgdf5ffdd2012-05-11 07:46:03 +00002439.purgem upsample16
2440.purgem upsample32
2441.purgem upsample_row
Tom Hudson0d47d2d2016-05-04 13:22:56 -04002442
2443
2444/*****************************************************************************/
2445
2446/*
2447 * GLOBAL(JOCTET*)
2448 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
2449 * JCOEFPTR block, int last_dc_val,
2450 * c_derived_tbl *dctbl, c_derived_tbl *actbl)
2451 *
2452 */
2453
2454.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
2455 sub \PUT_BITS, \PUT_BITS, #0x8
2456 lsr \TMP, \PUT_BUFFER, \PUT_BITS
2457 uxtb \TMP, \TMP
2458 strb \TMP, [\BUFFER, #1]!
2459 cmp \TMP, #0xff
2460 /*it eq*/
2461 strbeq \ZERO, [\BUFFER, #1]!
2462.endm
2463
2464.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
2465 /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
2466 add \PUT_BITS, \SIZE
2467 /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
2468 orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
2469.endm
2470
2471.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
2472 cmp \PUT_BITS, #0x10
2473 blt 15f
2474 eor \ZERO, \ZERO, \ZERO
2475 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
2476 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
247715:
2478.endm
2479
2480.balign 16
2481jsimd_huff_encode_one_block_neon_consts:
2482 .byte 0x01
2483 .byte 0x02
2484 .byte 0x04
2485 .byte 0x08
2486 .byte 0x10
2487 .byte 0x20
2488 .byte 0x40
2489 .byte 0x80
2490
2491asm_function jsimd_huff_encode_one_block_neon
2492 push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2493 add r7, sp, #0x1c
2494 sub r4, sp, #0x40
2495 bfc r4, #0, #5
2496 mov sp, r4 /* align sp on 32 bytes */
2497 vst1.64 {d8, d9, d10, d11}, [r4, :128]!
2498 vst1.64 {d12, d13, d14, d15}, [r4, :128]
2499 sub sp, #0x140 /* reserve 320 bytes */
2500 str r0, [sp, #0x18] /* working state > sp + Ox18 */
2501 add r4, sp, #0x20 /* r4 = t1 */
2502 ldr lr, [r7, #0x8] /* lr = dctbl */
2503 sub r10, r1, #0x1 /* r10=buffer-- */
2504 ldrsh r1, [r2]
2505 mov r9, #0x10
2506 mov r8, #0x1
2507 adr r5, jsimd_huff_encode_one_block_neon_consts
2508 /* prepare data */
2509 vld1.8 {d26}, [r5, :64]
2510 veor q8, q8, q8
2511 veor q9, q9, q9
2512 vdup.16 q14, r9
2513 vdup.16 q15, r8
2514 veor q10, q10, q10
2515 veor q11, q11, q11
2516 sub r1, r1, r3
2517 add r9, r2, #0x22
2518 add r8, r2, #0x18
2519 add r3, r2, #0x36
2520 vmov.16 d0[0], r1
2521 vld1.16 {d2[0]}, [r9, :16]
2522 vld1.16 {d4[0]}, [r8, :16]
2523 vld1.16 {d6[0]}, [r3, :16]
2524 add r1, r2, #0x2
2525 add r9, r2, #0x30
2526 add r8, r2, #0x26
2527 add r3, r2, #0x28
2528 vld1.16 {d0[1]}, [r1, :16]
2529 vld1.16 {d2[1]}, [r9, :16]
2530 vld1.16 {d4[1]}, [r8, :16]
2531 vld1.16 {d6[1]}, [r3, :16]
2532 add r1, r2, #0x10
2533 add r9, r2, #0x40
2534 add r8, r2, #0x34
2535 add r3, r2, #0x1a
2536 vld1.16 {d0[2]}, [r1, :16]
2537 vld1.16 {d2[2]}, [r9, :16]
2538 vld1.16 {d4[2]}, [r8, :16]
2539 vld1.16 {d6[2]}, [r3, :16]
2540 add r1, r2, #0x20
2541 add r9, r2, #0x32
2542 add r8, r2, #0x42
2543 add r3, r2, #0xc
2544 vld1.16 {d0[3]}, [r1, :16]
2545 vld1.16 {d2[3]}, [r9, :16]
2546 vld1.16 {d4[3]}, [r8, :16]
2547 vld1.16 {d6[3]}, [r3, :16]
2548 add r1, r2, #0x12
2549 add r9, r2, #0x24
2550 add r8, r2, #0x50
2551 add r3, r2, #0xe
2552 vld1.16 {d1[0]}, [r1, :16]
2553 vld1.16 {d3[0]}, [r9, :16]
2554 vld1.16 {d5[0]}, [r8, :16]
2555 vld1.16 {d7[0]}, [r3, :16]
2556 add r1, r2, #0x4
2557 add r9, r2, #0x16
2558 add r8, r2, #0x60
2559 add r3, r2, #0x1c
2560 vld1.16 {d1[1]}, [r1, :16]
2561 vld1.16 {d3[1]}, [r9, :16]
2562 vld1.16 {d5[1]}, [r8, :16]
2563 vld1.16 {d7[1]}, [r3, :16]
2564 add r1, r2, #0x6
2565 add r9, r2, #0x8
2566 add r8, r2, #0x52
2567 add r3, r2, #0x2a
2568 vld1.16 {d1[2]}, [r1, :16]
2569 vld1.16 {d3[2]}, [r9, :16]
2570 vld1.16 {d5[2]}, [r8, :16]
2571 vld1.16 {d7[2]}, [r3, :16]
2572 add r1, r2, #0x14
2573 add r9, r2, #0xa
2574 add r8, r2, #0x44
2575 add r3, r2, #0x38
2576 vld1.16 {d1[3]}, [r1, :16]
2577 vld1.16 {d3[3]}, [r9, :16]
2578 vld1.16 {d5[3]}, [r8, :16]
2579 vld1.16 {d7[3]}, [r3, :16]
2580 vcgt.s16 q8, q8, q0
2581 vcgt.s16 q9, q9, q1
2582 vcgt.s16 q10, q10, q2
2583 vcgt.s16 q11, q11, q3
2584 vabs.s16 q0, q0
2585 vabs.s16 q1, q1
2586 vabs.s16 q2, q2
2587 vabs.s16 q3, q3
2588 veor q8, q8, q0
2589 veor q9, q9, q1
2590 veor q10, q10, q2
2591 veor q11, q11, q3
2592 add r9, r4, #0x20
2593 add r8, r4, #0x80
2594 add r3, r4, #0xa0
2595 vclz.i16 q0, q0
2596 vclz.i16 q1, q1
2597 vclz.i16 q2, q2
2598 vclz.i16 q3, q3
2599 vsub.i16 q0, q14, q0
2600 vsub.i16 q1, q14, q1
2601 vsub.i16 q2, q14, q2
2602 vsub.i16 q3, q14, q3
2603 vst1.16 {d0, d1, d2, d3}, [r4, :256]
2604 vst1.16 {d4, d5, d6, d7}, [r9, :256]
2605 vshl.s16 q0, q15, q0
2606 vshl.s16 q1, q15, q1
    /* --- Continuation of a Huffman-encode routine; the function entry and
     * the first-half magnitude/nbits computation are above this excerpt.
     *
     * Register/constant roles inferred from the code below (the prologue is
     * outside this view — TODO confirm against the full file):
     *   r2  = coefficient block pointer (16-bit coefficients)
     *   r4  = scratch-area base (nbits arrays at r4+0x40/0x60, value arrays
     *         at r4+0xc0/0xe0)
     *   r7  = frame base used to unwind sp in the epilogue
     *   lr  = dctbl (DC Huffman table) at this point — set before excerpt
     *   q14 = per-lane constant, presumably 16 (for nbits = 16 - clz)
     *   q15 = per-lane constant, presumably 1  (for mask = (1<<nbits)-1)
     *   q8-q11 = sign masks; q0-q3 = nbits of the first 32 coefficients
     */

    /* Finish first half: turn nbits (q0-q3) into (1 << nbits) - 1 masks,
     * then mask the one's-complemented values (q8-q11) down to nbits bits
     * and store them.  r8/r3 destinations were computed before the excerpt. */
    vshl.s16        q2, q15, q2
    vshl.s16        q3, q15, q3
    vsub.i16        q0, q0, q15
    vsub.i16        q1, q1, q15
    vsub.i16        q2, q2, q15
    vsub.i16        q3, q3, q15
    vand            q8, q8, q0
    vand            q9, q9, q1
    vand            q10, q10, q2
    vand            q11, q11, q3
    vst1.16         {d16, d17, d18, d19}, [r8, :256]
    vst1.16         {d20, d21, d22, d23}, [r3, :256]

    /* Gather the remaining 32 coefficients into q4-q7, one 16-bit lane at a
     * time, from scattered byte offsets within the block (r2) — the offsets
     * implement the JPEG zigzag scan order. */
    add             r1, r2, #0x46
    add             r9, r2, #0x3a
    add             r8, r2, #0x74
    add             r3, r2, #0x6a
    vld1.16         {d8[0]}, [r1, :16]
    vld1.16         {d10[0]}, [r9, :16]
    vld1.16         {d12[0]}, [r8, :16]
    vld1.16         {d14[0]}, [r3, :16]
    veor            q8, q8, q8                  /* zero q8-q11 for the sign compares below */
    veor            q9, q9, q9
    veor            q10, q10, q10
    veor            q11, q11, q11
    add             r1, r2, #0x54
    add             r9, r2, #0x2c
    add             r8, r2, #0x76
    add             r3, r2, #0x78
    vld1.16         {d8[1]}, [r1, :16]
    vld1.16         {d10[1]}, [r9, :16]
    vld1.16         {d12[1]}, [r8, :16]
    vld1.16         {d14[1]}, [r3, :16]
    add             r1, r2, #0x62
    add             r9, r2, #0x1e
    add             r8, r2, #0x68
    add             r3, r2, #0x7a
    vld1.16         {d8[2]}, [r1, :16]
    vld1.16         {d10[2]}, [r9, :16]
    vld1.16         {d12[2]}, [r8, :16]
    vld1.16         {d14[2]}, [r3, :16]
    add             r1, r2, #0x70
    add             r9, r2, #0x2e
    add             r8, r2, #0x5a
    add             r3, r2, #0x6c
    vld1.16         {d8[3]}, [r1, :16]
    vld1.16         {d10[3]}, [r9, :16]
    vld1.16         {d12[3]}, [r8, :16]
    vld1.16         {d14[3]}, [r3, :16]
    add             r1, r2, #0x72
    add             r9, r2, #0x3c
    add             r8, r2, #0x4c
    add             r3, r2, #0x5e
    vld1.16         {d9[0]}, [r1, :16]
    vld1.16         {d11[0]}, [r9, :16]
    vld1.16         {d13[0]}, [r8, :16]
    vld1.16         {d15[0]}, [r3, :16]
    add             r1, r2, #0x64
    add             r9, r2, #0x4a
    add             r8, r2, #0x3e
    add             r3, r2, #0x6e
    vld1.16         {d9[1]}, [r1, :16]
    vld1.16         {d11[1]}, [r9, :16]
    vld1.16         {d13[1]}, [r8, :16]
    vld1.16         {d15[1]}, [r3, :16]
    add             r1, r2, #0x56
    add             r9, r2, #0x58
    add             r8, r2, #0x4e
    add             r3, r2, #0x7c
    vld1.16         {d9[2]}, [r1, :16]
    vld1.16         {d11[2]}, [r9, :16]
    vld1.16         {d13[2]}, [r8, :16]
    vld1.16         {d15[2]}, [r3, :16]
    add             r1, r2, #0x48
    add             r9, r2, #0x66
    add             r8, r2, #0x5c
    add             r3, r2, #0x7e
    vld1.16         {d9[3]}, [r1, :16]
    vld1.16         {d11[3]}, [r9, :16]
    vld1.16         {d13[3]}, [r8, :16]
    vld1.16         {d15[3]}, [r3, :16]

    /* Sign handling for the second half: q8-q11 = (0 > coeff) all-ones mask;
     * take absolute values, then XOR with the mask so negative values end up
     * one's-complemented (the form JPEG Huffman coding emits). */
    vcgt.s16        q8, q8, q4
    vcgt.s16        q9, q9, q5
    vcgt.s16        q10, q10, q6
    vcgt.s16        q11, q11, q7
    vabs.s16        q4, q4
    vabs.s16        q5, q5
    vabs.s16        q6, q6
    vabs.s16        q7, q7
    veor            q8, q8, q4
    veor            q9, q9, q5
    veor            q10, q10, q6
    veor            q11, q11, q7

    /* Destination pointers into the scratch area for the second half. */
    add             r1, r4, #0x40
    add             r9, r4, #0x60
    add             r8, r4, #0xc0
    add             r3, r4, #0xe0

    /* nbits = q14 - clz(abs) (q14 presumably 16); store the nbits arrays,
     * then build (1 << nbits) - 1 masks and store the masked values. */
    vclz.i16        q4, q4
    vclz.i16        q5, q5
    vclz.i16        q6, q6
    vclz.i16        q7, q7
    vsub.i16        q4, q14, q4
    vsub.i16        q5, q14, q5
    vsub.i16        q6, q14, q6
    vsub.i16        q7, q14, q7
    vst1.16         {d8, d9, d10, d11}, [r1, :256]
    vst1.16         {d12, d13, d14, d15}, [r9, :256]
    vshl.s16        q4, q15, q4
    vshl.s16        q5, q15, q5
    vshl.s16        q6, q15, q6
    vshl.s16        q7, q15, q7
    vsub.i16        q4, q4, q15
    vsub.i16        q5, q5, q15
    vsub.i16        q6, q6, q15
    vsub.i16        q7, q7, q15
    vand            q8, q8, q4
    vand            q9, q9, q5
    vand            q10, q10, q6
    vand            q11, q11, q7
    vst1.16         {d16, d17, d18, d19}, [r8, :256]
    vst1.16         {d20, d21, d22, d23}, [r3, :256]

    /* Set up for bit emission and encode the DC coefficient:
     * code = dctbl->ehufco[nbits], size = dctbl->ehufsi[nbits], then the
     * nbits low-order bits of the (one's-complemented) difference. */
    ldr             r12, [r7, #0xc]             /* r12 = actbl */
    add             r1, lr, #0x400              /* r1 = dctbl->ehufsi */
    mov             r9, r12                     /* r9 = actbl */
    add             r6, r4, #0x80               /* r6 = t2 */
    ldr             r11, [r0, #0x8]             /* r11 = put_buffer */
    ldr             r4, [r0, #0xc]              /* r4 = put_bits */
    ldrh            r2, [r6, #-128]             /* r2 = nbits */
    ldrh            r3, [r6]                    /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */
    ldr             r0, [lr, r2, lsl #2]        /* r0 = dctbl->ehufco[nbits] */
    ldrb            r5, [r1, r2]                /* r5 = dctbl->ehufsi[nbits] */
    put_bits        r11, r4, r0, r5
    checkbuf15      r10, r11, r4, r5, r0
    put_bits        r11, r4, r3, r2
    checkbuf15      r10, r11, r4, r5, r0
    mov             lr, r6                      /* lr = t2 */
    add             r5, r9, #0x400              /* r5 = actbl->ehufsi */
    ldrsb           r6, [r5, #0xf0]             /* r6 = actbl->ehufsi[0xf0] (ZRL size) */

    /* Build a 64-bit "coefficient is nonzero" bitmap: compare every lane
     * against zero, narrow to bytes, AND with per-lane bit weights in d26
     * (presumably set before this excerpt), and fold with pairwise adds. */
    veor            q8, q8, q8
    vceq.i16        q0, q0, q8
    vceq.i16        q1, q1, q8
    vceq.i16        q2, q2, q8
    vceq.i16        q3, q3, q8
    vceq.i16        q4, q4, q8
    vceq.i16        q5, q5, q8
    vceq.i16        q6, q6, q8
    vceq.i16        q7, q7, q8
    vmovn.i16       d0, q0
    vmovn.i16       d2, q1
    vmovn.i16       d4, q2
    vmovn.i16       d6, q3
    vmovn.i16       d8, q4
    vmovn.i16       d10, q5
    vmovn.i16       d12, q6
    vmovn.i16       d14, q7
    vand            d0, d0, d26
    vand            d2, d2, d26
    vand            d4, d4, d26
    vand            d6, d6, d26
    vand            d8, d8, d26
    vand            d10, d10, d26
    vand            d12, d12, d26
    vand            d14, d14, d26
    vpadd.i8        d0, d0, d2
    vpadd.i8        d4, d4, d6
    vpadd.i8        d8, d8, d10
    vpadd.i8        d12, d12, d14
    vpadd.i8        d0, d0, d4
    vpadd.i8        d8, d8, d12
    vpadd.i8        d0, d0, d8
    vmov.32         r1, d0[1]
    vmov.32         r8, d0[0]
    mvn             r1, r1                      /* invert: now 1 = nonzero */
    mvn             r8, r8
    lsrs            r1, r1, #0x1
    rrx             r8, r8                      /* shift in last r1 bit while shifting out DC bit */
    rbit            r1, r1                      /* r1 = index1 */
    rbit            r8, r8                      /* r8 = index0 */
    ldr             r0, [r9, #0x3c0]            /* r0 = actbl->ehufco[0xf0] (ZRL code) */
    str             r1, [sp, #0x14]             /* index1 -> sp + 0x14 */
    cmp             r8, #0x0
    beq             6f                          /* lower half all zero: skip first loop */

    /* First AC loop: walk index0 (coefficients 1..31) MSB-first. */
1:
    clz             r2, r8                      /* r2 = length of zero run */
    add             lr, lr, r2, lsl #1          /* advance value pointer past the run */
    lsl             r8, r8, r2                  /* consume the skipped bits */
    ldrh            r1, [lr, #-126]             /* r1 = nbits of this coefficient */
2:
    cmp             r2, #0x10                   /* run >= 16: emit ZRL symbol(s) */
    blt             3f
    sub             r2, r2, #0x10
    put_bits        r11, r4, r0, r6             /* r0/r6 = ZRL code/size */
    cmp             r4, #0x10
    blt             2b
    eor             r3, r3, r3                  /* >= 16 buffered bits: flush two bytes */
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    b               2b
3:
    add             r2, r1, r2, lsl #4          /* symbol = (run << 4) | nbits */
    ldrh            r3, [lr, #2]!               /* r3 = value bits; advance pointer */
    ldr             r12, [r9, r2, lsl #2]       /* code = actbl->ehufco[symbol] */
    ldrb            r2, [r5, r2]                /* size = actbl->ehufsi[symbol] */
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1             /* append the nbits value bits */
    checkbuf15      r10, r11, r4, r2, r12
    lsls            r8, r8, #0x1                /* next bitmap bit */
    bne             1b
6:
    /* Switch to the upper half (index1, coefficients 32..63).  The distance
     * still left in the lower-half buffer is folded into the first zero run. */
    add             r12, sp, #0x20              /* r12 = t1 */
    ldr             r8, [sp, #0x14]             /* r8 = index1 */
    adds            r12, #0xc0                  /* r12 = t2 + (DCTSIZE2/2) */
    cmp             r8, #0x0
    beq             6f                          /* upper half all zero: done */
    clz             r2, r8
    sub             r12, r12, lr                /* bytes remaining in lower half */
    lsl             r8, r8, r2
    add             r2, r2, r12, lsr #1         /* total zero run across the halves */
    add             lr, lr, r2, lsl #1
    b               7f

    /* Second AC loop: same structure as the first. */
1:
    clz             r2, r8
    add             lr, lr, r2, lsl #1
    lsl             r8, r8, r2
7:
    ldrh            r1, [lr, #-126]             /* r1 = nbits of this coefficient */
2:
    cmp             r2, #0x10                   /* run >= 16: emit ZRL symbol(s) */
    blt             3f
    sub             r2, r2, #0x10
    put_bits        r11, r4, r0, r6
    cmp             r4, #0x10
    blt             2b
    eor             r3, r3, r3
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    b               2b
3:
    add             r2, r1, r2, lsl #4          /* symbol = (run << 4) | nbits */
    ldrh            r3, [lr, #2]!
    ldr             r12, [r9, r2, lsl #2]
    ldrb            r2, [r5, r2]
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1
    checkbuf15      r10, r11, r4, r2, r12
    lsls            r8, r8, #0x1
    bne             1b
6:
    /* If the last nonzero coefficient was not number 63 (lr short of the end
     * of the value buffer), emit the EOB symbol: ehufco[0]/ehufsi[0]. */
    add             r0, sp, #0x20
    add             r0, #0xfe
    cmp             lr, r0
    bhs             1f
    ldr             r1, [r9]                    /* EOB code */
    ldrb            r0, [r5]                    /* EOB size */
    put_bits        r11, r4, r1, r0
    checkbuf15      r10, r11, r4, r0, r1
1:
    /* Epilogue: write put_buffer/put_bits back to the state struct saved at
     * sp+0x18 (stored before this excerpt — TODO confirm layout), set the
     * return value (r10 presumably tracks the output byte pointer), restore
     * callee-saved q4-q7 and the stack frame, and return. */
    ldr             r12, [sp, #0x18]
    str             r11, [r12, #0x8]            /* save put_buffer */
    str             r4, [r12, #0xc]             /* save put_bits */
    add             r0, r10, #0x1
    add             r4, sp, #0x140
    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
    vld1.64         {d12, d13, d14, d15}, [r4, :128]
    sub             r4, r7, #0x1c
    mov             sp, r4
    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
2875
/* The bit-emission helper macros are file-local; undefine them now that the
 * last routine using them has been assembled. */
.purgem         emit_byte
.purgem         put_bits
.purgem         checkbuf15