blob: d236314e2530017af8ecb928d225c4f68795bda7 [file] [log] [blame]
DRCba55b2c2014-02-05 08:15:44 +00001/*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
DRCbdc76502014-08-23 15:57:38 +00007 * Copyright (C) 2013-2014, Linaro Limited
DRCba55b2c2014-02-05 08:15:44 +00008 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
DRCd38b4f22016-01-16 01:53:32 -06009 * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
DRCec6941f2016-01-15 09:29:11 -060010 * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
DRCd38b4f22016-01-16 01:53:32 -060011 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
DRCba55b2c2014-02-05 08:15:44 +000012 *
13 * This software is provided 'as-is', without any express or implied
14 * warranty. In no event will the authors be held liable for any damages
15 * arising from the use of this software.
16 *
17 * Permission is granted to anyone to use this software for any purpose,
18 * including commercial applications, and to alter it and redistribute it
19 * freely, subject to the following restrictions:
20 *
21 * 1. The origin of this software must not be misrepresented; you must not
22 * claim that you wrote the original software. If you use this software
23 * in a product, an acknowledgment in the product documentation would be
24 * appreciated but is not required.
25 * 2. Altered source versions must be plainly marked as such, and must not be
26 * misrepresented as being the original software.
27 * 3. This notice may not be removed or altered from any source distribution.
28 */
29
/*
 * On Linux/ELF targets, emit an empty .note.GNU-stack section so that this
 * object does not request an executable stack.
 */
30#if defined(__linux__) && defined(__ELF__)
DRCcf888482016-02-02 23:17:06 -060031.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
DRCba55b2c2014-02-05 08:15:44 +000032#endif
33
/* Switch to the code section; the macros, constant table and function below are assembled here. */
34.text
DRCba55b2c2014-02-05 08:15:44 +000035
36
/*
 * NOTE(review): RESPECT_STRICT_ALIGNMENT is not referenced in the part of
 * the file reviewed here; presumably it selects alignment-safe code paths
 * in routines further down -- confirm against its uses.
 */
37#define RESPECT_STRICT_ALIGNMENT 1
38
DRCba55b2c2014-02-05 08:15:44 +000039
DRCba55b2c2014-02-05 08:15:44 +000040/*****************************************************************************/
41
/*
 * asm_function fname
 *
 * Opens an exported function: emits the visibility directives for the
 * symbol and then its entry label.
 *
 *   Apple targets : the symbol is spelled _fname (leading underscore) and
 *                   made global.
 *   other targets : fname is made global; when the object format is ELF it
 *                   is additionally typed as a function and marked hidden.
 */
.macro asm_function fname
#if defined(__APPLE__)
    .globl          _\fname
_\fname:
#else
    .globl          \fname
#if defined(__ELF__)
    .type           \fname, %function
    .hidden         \fname
#endif
\fname:
#endif
.endm
56
/*
 * transpose_single x0, x1, xi, xilen, literal
 *
 * Transpose the elements of a single 128-bit register against itself.
 *
 *   x0       source register; receives the trn1 result
 *   x1       element 0 is overwritten with element 1 of x0, then the whole
 *            register receives the trn2 result
 *   xi       scratch; element 0 is overwritten with element 0 of x0
 *   xilen    element specifier used for the two element copies
 *   literal  arrangement specifier used for trn1/trn2
 *
 * Clobbers: xi (and x1, which is also an output).
 */
.macro transpose_single x0, x1, xi, xilen, literal
    /* split x0: element 0 goes to the scratch register, element 1 to x1 */
    ins             \xi\()\xilen[0], \x0\()\xilen[0]
    ins             \x1\()\xilen[0], \x0\()\xilen[1]
    /* interleave; trn2 reads the scratch copy because x0 was just overwritten */
    trn1            \x0\()\literal, \x0\()\literal, \x1\()\literal
    trn2            \x1\()\literal, \xi\()\literal, \x1\()\literal
.endm
64
/*
 * transpose x0, x1, xi, xilen, literal
 *
 * Transpose the elements of two different registers.
 *
 * On exit:
 *   x0 = trn1(old x0, x1)
 *   x1 = trn2(old x0, x1)
 *
 *   xi       scratch register that holds the old value of x0
 *   xilen    arrangement specifier used for the register copy
 *   literal  arrangement specifier used for trn1/trn2
 *
 * Clobbers: xi.
 */
.macro transpose x0, x1, xi, xilen, literal
    /* keep the original x0: trn1 overwrites it before trn2 needs it */
    mov             \xi\()\xilen, \x0\()\xilen
    trn1            \x0\()\literal, \x0\()\literal, \x1\()\literal
    trn2            \x1\()\literal, \xi\()\literal, \x1\()\literal
.endm
71
/*
 * transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
 *
 * Helper for transposing a block of 4x4 coefficients held in four 64-bit
 * registers: interleaves the register pairs (x0, x2) and (x1, x3).
 *
 *   x0 <- trn1(x0, x2)      x2 <- trn2(old x0, x2)
 *   x1 <- trn1(x1, x3)      x3 <- trn2(old x1, x3)
 *
 *   xNlen    arrangement specifier applied to register xN
 *   xi       scratch register holding the saved copy of x0, then of x1
 *   xilen    arrangement specifier used for the register copies
 *
 * Clobbers: xi.
 */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    /* pair x0 / x2 */
    mov             \xi\()\xilen, \x0\()\xilen
    trn1            \x0\()\x0len, \x0\()\x0len, \x2\()\x2len
    trn2            \x2\()\x2len, \xi\()\x0len, \x2\()\x2len
    /* pair x1 / x3 */
    mov             \xi\()\xilen, \x1\()\xilen
    trn1            \x1\()\x1len, \x1\()\x1len, \x3\()\x3len
    trn2            \x3\()\x3len, \xi\()\x1len, \x3\()\x3len
.endm
81
/*
 * transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
 *
 * Helper for the 4x4 transpose: interleaves the register pairs (x0, x1)
 * and (x2, x3).
 *
 *   x0 <- trn1(x0, x1)      x1 <- trn2(old x0, x1)
 *   x2 <- trn1(x2, x3)      x3 <- trn2(old x2, x3)
 *
 *   xNlen    arrangement specifier applied to register xN
 *   xi       scratch register holding the saved copy of x0, then of x2
 *   xilen    arrangement specifier used for the register copies
 *
 * Every register is paired with its own suffix, and the scratch register is
 * read with the suffix of the register it stands in for. The previous body
 * applied x2len to x1 and x3 and x1len to the saved copy of x2; that only
 * assembled because the visible caller (transpose_4x4) passes the same
 * suffix for all four registers. The expansion for such callers is
 * unchanged.
 *
 * Clobbers: xi.
 */
.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    /* pair x0 / x1 */
    mov             \xi\()\xilen, \x0\()\xilen
    trn1            \x0\()\x0len, \x0\()\x0len, \x1\()\x1len
    trn2            \x1\()\x1len, \xi\()\x0len, \x1\()\x1len
    /* pair x2 / x3 */
    mov             \xi\()\xilen, \x2\()\xilen
    trn1            \x2\()\x2len, \x2\()\x2len, \x3\()\x3len
    trn2            \x3\()\x3len, \xi\()\x2len, \x3\()\x3len
.endm
90
/*
 * transpose_4x4 x0, x1, x2, x3, x5
 *
 * Transpose a 4x4 block of 16-bit elements held in the low 64 bits of
 * x0..x3 (one row per register), in place. Done in two stages:
 * a 16-bit interleave of (x0, x1) and (x2, x3), then a 32-bit interleave
 * of (x0, x2) and (x1, x3).
 *
 * x5 is a scratch register; the helpers copy into it with a .16b
 * arrangement, so all 128 bits of x5 are clobbered.
 */
DRCcf888482016-02-02 23:17:06 -060091.macro transpose_4x4 x0, x1, x2, x3, x5
92 transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
93 transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
DRCba55b2c2014-02-05 08:15:44 +000094.endm
95
/*
 * transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
 *
 * Transpose an 8x8 matrix of 16-bit elements, in place.
 *
 *   l0..l7   register numbers/names WITHOUT an arrangement suffix (the
 *            macro appends .8h/.4s/.2d itself); row N is in lN on entry,
 *            column N is in lN on exit
 *   t0..t3   scratch registers, clobbered
 *
 * The transpose is done in three trn1/trn2 stages with element sizes of
 * 16, 32 and 64 bits. Intermediate results are spread over both the t and
 * the l registers, so the instruction order within each stage matters:
 * every source is read before the instruction that overwrites it.
 */
DRCec6941f2016-01-15 09:29:11 -060096.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
/* Stage 1: 16-bit interleave of adjacent rows; trn1 results go to t0..t3, trn2 results to l1/l3/l5/l7 */
DRCcf888482016-02-02 23:17:06 -060097 trn1 \t0\().8h, \l0\().8h, \l1\().8h
98 trn1 \t1\().8h, \l2\().8h, \l3\().8h
99 trn1 \t2\().8h, \l4\().8h, \l5\().8h
100 trn1 \t3\().8h, \l6\().8h, \l7\().8h
101 trn2 \l1\().8h, \l0\().8h, \l1\().8h
102 trn2 \l3\().8h, \l2\().8h, \l3\().8h
103 trn2 \l5\().8h, \l4\().8h, \l5\().8h
104 trn2 \l7\().8h, \l6\().8h, \l7\().8h
DRCec6941f2016-01-15 09:29:11 -0600105
/* Stage 2: 32-bit interleave of the stage-1 results */
DRCcf888482016-02-02 23:17:06 -0600106 trn1 \l4\().4s, \t2\().4s, \t3\().4s
107 trn2 \t3\().4s, \t2\().4s, \t3\().4s
108 trn1 \t2\().4s, \t0\().4s, \t1\().4s
109 trn2 \l2\().4s, \t0\().4s, \t1\().4s
110 trn1 \t0\().4s, \l1\().4s, \l3\().4s
111 trn2 \l3\().4s, \l1\().4s, \l3\().4s
112 trn2 \t1\().4s, \l5\().4s, \l7\().4s
113 trn1 \l5\().4s, \l5\().4s, \l7\().4s
DRCec6941f2016-01-15 09:29:11 -0600114
/* Stage 3: 64-bit interleave; writes the final columns into l0..l7 */
DRCcf888482016-02-02 23:17:06 -0600115 trn2 \l6\().2d, \l2\().2d, \t3\().2d
116 trn1 \l0\().2d, \t2\().2d, \l4\().2d
117 trn1 \l1\().2d, \t0\().2d, \l5\().2d
118 trn2 \l7\().2d, \l3\().2d, \t1\().2d
119 trn1 \l2\().2d, \l2\().2d, \t3\().2d
120 trn2 \l4\().2d, \t2\().2d, \l4\().2d
121 trn1 \l3\().2d, \l3\().2d, \t1\().2d
122 trn2 \l5\().2d, \t0\().2d, \l5\().2d
DRCec6941f2016-01-15 09:29:11 -0600123.endm
124
DRCcf888482016-02-02 23:17:06 -0600125
/*
 * Bias added to each 8-bit output sample; jsimd_idct_islow_neon broadcasts
 * it with movi and adds it to the narrowed results before storing them.
 */
DRCba55b2c2014-02-05 08:15:44 +0000126#define CENTERJSAMPLE 128
127
128/*****************************************************************************/
129
130/*
131 * Perform dequantization and inverse DCT on one block of coefficients.
132 *
133 * GLOBAL(void)
DRCbd498032016-02-19 08:53:33 -0600134 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
DRCba55b2c2014-02-05 08:15:44 +0000135 * JSAMPARRAY output_buf, JDIMENSION output_col)
136 */
137
/*
 * Fixed-point parameters of the slow-but-accurate integer IDCT.
 *
 * CONST_BITS  number of fractional bits in the multiplier constants below;
 *             also the left-shift applied to terms that are added to
 *             products of those constants.
 * PASS1_BITS  extra scaling carried by the first-pass results (they are
 *             descaled by CONST_BITS-PASS1_BITS rather than CONST_BITS).
 */
DRCcf888482016-02-02 23:17:06 -0600138#define CONST_BITS 13
139#define PASS1_BITS 2
DRCba55b2c2014-02-05 08:15:44 +0000140
/* Multipliers: each value is the quoted real constant times 2^CONST_BITS, rounded to the nearest integer. */
DRCcf888482016-02-02 23:17:06 -0600141#define F_0_298 2446 /* FIX(0.298631336) */
142#define F_0_390 3196 /* FIX(0.390180644) */
143#define F_0_541 4433 /* FIX(0.541196100) */
144#define F_0_765 6270 /* FIX(0.765366865) */
145#define F_0_899 7373 /* FIX(0.899976223) */
146#define F_1_175 9633 /* FIX(1.175875602) */
147#define F_1_501 12299 /* FIX(1.501321110) */
148#define F_1_847 15137 /* FIX(1.847759065) */
149#define F_1_961 16069 /* FIX(1.961570560) */
150#define F_2_053 16819 /* FIX(2.053119869) */
151#define F_2_562 20995 /* FIX(2.562915447) */
152#define F_3_072 25172 /* FIX(3.072711026) */
DRCba55b2c2014-02-05 08:15:44 +0000153
/*
 * Constant table for jsimd_idct_islow_neon: 16 halfwords (32 bytes),
 * 16-byte aligned. The function takes its address with adr and loads the
 * whole table into v0/v1 with a single ld1, so the first eight entries end
 * up in v0.h[0..7] and the next eight in v1.h[0..7].
 *
 * Entries written with a leading minus are stored already negated, which
 * lets the code apply them with a plain multiply or multiply-accumulate.
 * The last four entries only pad the table to two full vectors.
 */
154.balign 16
DRC62999d72014-12-19 15:36:39 +0000155Ljsimd_idct_islow_neon_consts:
DRCcf888482016-02-02 23:17:06 -0600156 .short F_0_298
157 .short -F_0_390
158 .short F_0_541
159 .short F_0_765
160 .short - F_0_899
161 .short F_1_175
162 .short F_1_501
163 .short - F_1_847
164 .short - F_1_961
165 .short F_2_053
166 .short - F_2_562
167 .short F_3_072
168 .short 0 /* padding */
169 .short 0
170 .short 0
171 .short 0
DRCcb49bb02016-02-02 23:10:27 -0600172
/* The F_* names were only needed to build the table; drop them so they cannot leak into later code. */
173#undef F_0_298
174#undef F_0_390
175#undef F_0_541
176#undef F_0_765
177#undef F_0_899
178#undef F_1_175
179#undef F_1_501
180#undef F_1_847
181#undef F_1_961
182#undef F_2_053
183#undef F_2_562
184#undef F_3_072
185
/*
 * Lane aliases for the constants once the table has been loaded into
 * v0/v1, listed in table order. P = the constant itself, N = its negation
 * (matching the sign with which the entry is stored in the table).
 *
 * Note: jsimd_idct_islow_neon later overwrites v0 with CENTERJSAMPLE, so
 * the v0-based aliases are only valid before that point.
 */
186#define XFIX_P_0_298 v0.h[0]
187#define XFIX_N_0_390 v0.h[1]
188#define XFIX_P_0_541 v0.h[2]
189#define XFIX_P_0_765 v0.h[3]
190#define XFIX_N_0_899 v0.h[4]
191#define XFIX_P_1_175 v0.h[5]
192#define XFIX_P_1_501 v0.h[6]
193#define XFIX_N_1_847 v0.h[7]
194#define XFIX_N_1_961 v1.h[0]
195#define XFIX_P_2_053 v1.h[1]
196#define XFIX_N_2_562 v1.h[2]
197#define XFIX_P_3_072 v1.h[3]
DRCba55b2c2014-02-05 08:15:44 +0000198
199asm_function jsimd_idct_islow_neon
DRCba55b2c2014-02-05 08:15:44 +0000200 DCT_TABLE .req x0
201 COEF_BLOCK .req x1
202 OUTPUT_BUF .req x2
203 OUTPUT_COL .req x3
204 TMP1 .req x0
205 TMP2 .req x1
DRCcb49bb02016-02-02 23:10:27 -0600206 TMP3 .req x9
207 TMP4 .req x10
208 TMP5 .req x11
209 TMP6 .req x12
210 TMP7 .req x13
211 TMP8 .req x14
DRCba55b2c2014-02-05 08:15:44 +0000212
DRCcb49bb02016-02-02 23:10:27 -0600213 sub sp, sp, #64
DRC62999d72014-12-19 15:36:39 +0000214 adr x15, Ljsimd_idct_islow_neon_consts
DRCcf888482016-02-02 23:17:06 -0600215 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
DRCcb49bb02016-02-02 23:10:27 -0600216 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
DRCcf888482016-02-02 23:17:06 -0600217 ld1 {v0.8h, v1.8h}, [x15]
218 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
219 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
220 ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
221 ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
DRCba55b2c2014-02-05 08:15:44 +0000222
DRCcf888482016-02-02 23:17:06 -0600223 cmeq v16.8h, v3.8h, #0
224 cmeq v26.8h, v4.8h, #0
225 cmeq v27.8h, v5.8h, #0
226 cmeq v28.8h, v6.8h, #0
227 cmeq v29.8h, v7.8h, #0
228 cmeq v30.8h, v8.8h, #0
229 cmeq v31.8h, v9.8h, #0
DRCba55b2c2014-02-05 08:15:44 +0000230
DRCcf888482016-02-02 23:17:06 -0600231 and v10.16b, v16.16b, v26.16b
232 and v11.16b, v27.16b, v28.16b
233 and v12.16b, v29.16b, v30.16b
234 and v13.16b, v31.16b, v10.16b
235 and v14.16b, v11.16b, v12.16b
236 mul v2.8h, v2.8h, v18.8h
237 and v15.16b, v13.16b, v14.16b
238 shl v10.8h, v2.8h, #(PASS1_BITS)
239 sqxtn v16.8b, v15.8h
240 mov TMP1, v16.d[0]
241 sub sp, sp, #64
242 mvn TMP2, TMP1
DRCba55b2c2014-02-05 08:15:44 +0000243
DRCcf888482016-02-02 23:17:06 -0600244 cbnz TMP2, 2f
DRCcb49bb02016-02-02 23:10:27 -0600245 /* case all AC coeffs are zeros */
DRCcf888482016-02-02 23:17:06 -0600246 dup v2.2d, v10.d[0]
247 dup v6.2d, v10.d[1]
248 mov v3.16b, v2.16b
249 mov v7.16b, v6.16b
250 mov v4.16b, v2.16b
251 mov v8.16b, v6.16b
252 mov v5.16b, v2.16b
253 mov v9.16b, v6.16b
DRCcb49bb02016-02-02 23:10:27 -06002541:
255 /* for this transpose, we should organise data like this:
256 * 00, 01, 02, 03, 40, 41, 42, 43
257 * 10, 11, 12, 13, 50, 51, 52, 53
258 * 20, 21, 22, 23, 60, 61, 62, 63
259 * 30, 31, 32, 33, 70, 71, 72, 73
260 * 04, 05, 06, 07, 44, 45, 46, 47
261 * 14, 15, 16, 17, 54, 55, 56, 57
262 * 24, 25, 26, 27, 64, 65, 66, 67
263 * 34, 35, 36, 37, 74, 75, 76, 77
264 */
DRCcf888482016-02-02 23:17:06 -0600265 trn1 v28.8h, v2.8h, v3.8h
266 trn1 v29.8h, v4.8h, v5.8h
267 trn1 v30.8h, v6.8h, v7.8h
268 trn1 v31.8h, v8.8h, v9.8h
269 trn2 v16.8h, v2.8h, v3.8h
270 trn2 v17.8h, v4.8h, v5.8h
271 trn2 v18.8h, v6.8h, v7.8h
272 trn2 v19.8h, v8.8h, v9.8h
273 trn1 v2.4s, v28.4s, v29.4s
274 trn1 v6.4s, v30.4s, v31.4s
275 trn1 v3.4s, v16.4s, v17.4s
276 trn1 v7.4s, v18.4s, v19.4s
277 trn2 v4.4s, v28.4s, v29.4s
278 trn2 v8.4s, v30.4s, v31.4s
279 trn2 v5.4s, v16.4s, v17.4s
280 trn2 v9.4s, v18.4s, v19.4s
DRCcb49bb02016-02-02 23:10:27 -0600281 /* Even part: reverse the even part of the forward DCT. */
DRCcf888482016-02-02 23:17:06 -0600282 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
283 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
284 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
285 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
286 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
287 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
288 mov v21.16b, v19.16b /* tmp3 = z1 */
289 mov v20.16b, v18.16b /* tmp3 = z1 */
290 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
291 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
292 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
293 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
294 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
295 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
296 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
297 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
298 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
299 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
300 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
301 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
302 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
303 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
304 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
DRCba55b2c2014-02-05 08:15:44 +0000305
DRCcb49bb02016-02-02 23:10:27 -0600306 /* Odd part per figure 8; the matrix is unitary and hence its
307 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
308 */
DRCba55b2c2014-02-05 08:15:44 +0000309
DRCcf888482016-02-02 23:17:06 -0600310 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
311 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
312 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
313 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
314 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
DRCba55b2c2014-02-05 08:15:44 +0000315
DRCcf888482016-02-02 23:17:06 -0600316 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
317 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
318 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
319 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
320 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
321 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
322 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
323 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
324 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
DRCcb49bb02016-02-02 23:10:27 -0600325
DRCcf888482016-02-02 23:17:06 -0600326 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
327 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
328 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
329 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
330 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
331 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
332 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
333 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
334 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
DRCcb49bb02016-02-02 23:10:27 -0600335
DRCcf888482016-02-02 23:17:06 -0600336 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
337 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
338 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
339 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
DRCcb49bb02016-02-02 23:10:27 -0600340
DRCcf888482016-02-02 23:17:06 -0600341 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
342 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
343 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
344 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
345 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
346 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
347 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
348 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
DRCcb49bb02016-02-02 23:10:27 -0600349
DRCcf888482016-02-02 23:17:06 -0600350 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
351 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
352 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
353 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
354 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
355 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
356 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
357 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
DRCcb49bb02016-02-02 23:10:27 -0600358
359 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
360
DRCcf888482016-02-02 23:17:06 -0600361 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
362 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
363 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
364 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
365 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
366 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
367 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
368 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
369 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
370 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
371 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
372 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
373 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
374 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
375 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
376 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
DRCcb49bb02016-02-02 23:10:27 -0600377
DRCcf888482016-02-02 23:17:06 -0600378 shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
379 shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
380 shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
381 shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
382 shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
383 shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
384 shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
385 shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
386 shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
387 shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
388 shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
389 shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
390 shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
391 shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
392 shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
393 shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
394 movi v0.16b, #(CENTERJSAMPLE)
395 /* Prepare pointers (dual-issue with NEON instructions) */
396 ldp TMP1, TMP2, [OUTPUT_BUF], 16
397 sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
398 ldp TMP3, TMP4, [OUTPUT_BUF], 16
399 sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
400 add TMP1, TMP1, OUTPUT_COL
401 sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
402 add TMP2, TMP2, OUTPUT_COL
403 sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
404 add TMP3, TMP3, OUTPUT_COL
405 sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
406 add TMP4, TMP4, OUTPUT_COL
407 sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
408 ldp TMP5, TMP6, [OUTPUT_BUF], 16
409 sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
410 ldp TMP7, TMP8, [OUTPUT_BUF], 16
411 sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
412 add TMP5, TMP5, OUTPUT_COL
413 add v16.16b, v28.16b, v0.16b
414 add TMP6, TMP6, OUTPUT_COL
415 add v18.16b, v29.16b, v0.16b
416 add TMP7, TMP7, OUTPUT_COL
417 add v20.16b, v30.16b, v0.16b
418 add TMP8, TMP8, OUTPUT_COL
419 add v22.16b, v31.16b, v0.16b
DRCcb49bb02016-02-02 23:10:27 -0600420
421 /* Transpose the final 8-bit samples */
DRCcf888482016-02-02 23:17:06 -0600422 trn1 v28.16b, v16.16b, v18.16b
423 trn1 v30.16b, v20.16b, v22.16b
424 trn2 v29.16b, v16.16b, v18.16b
425 trn2 v31.16b, v20.16b, v22.16b
DRCcb49bb02016-02-02 23:10:27 -0600426
DRCcf888482016-02-02 23:17:06 -0600427 trn1 v16.8h, v28.8h, v30.8h
428 trn2 v18.8h, v28.8h, v30.8h
429 trn1 v20.8h, v29.8h, v31.8h
430 trn2 v22.8h, v29.8h, v31.8h
DRCcb49bb02016-02-02 23:10:27 -0600431
DRCcf888482016-02-02 23:17:06 -0600432 uzp1 v28.4s, v16.4s, v18.4s
433 uzp2 v30.4s, v16.4s, v18.4s
434 uzp1 v29.4s, v20.4s, v22.4s
435 uzp2 v31.4s, v20.4s, v22.4s
DRCcb49bb02016-02-02 23:10:27 -0600436
DRCba55b2c2014-02-05 08:15:44 +0000437 /* Store results to the output buffer */
DRCcb49bb02016-02-02 23:10:27 -0600438 st1 {v28.d}[0], [TMP1]
439 st1 {v29.d}[0], [TMP2]
440 st1 {v28.d}[1], [TMP3]
441 st1 {v29.d}[1], [TMP4]
442 st1 {v30.d}[0], [TMP5]
443 st1 {v31.d}[0], [TMP6]
444 st1 {v30.d}[1], [TMP7]
445 st1 {v31.d}[1], [TMP8]
DRCcf888482016-02-02 23:17:06 -0600446 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
DRCcb49bb02016-02-02 23:10:27 -0600447 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
DRCba55b2c2014-02-05 08:15:44 +0000448 blr x30
449
DRCcb49bb02016-02-02 23:10:27 -0600450.balign 16
4512:
DRCcf888482016-02-02 23:17:06 -0600452 mul v3.8h, v3.8h, v19.8h
453 mul v4.8h, v4.8h, v20.8h
454 mul v5.8h, v5.8h, v21.8h
455 add TMP4, xzr, TMP2, LSL #32
456 mul v6.8h, v6.8h, v22.8h
457 mul v7.8h, v7.8h, v23.8h
458 adds TMP3, xzr, TMP2, LSR #32
459 mul v8.8h, v8.8h, v24.8h
460 mul v9.8h, v9.8h, v25.8h
461 b.ne 3f
DRCcb49bb02016-02-02 23:10:27 -0600462 /* Right AC coef is zero */
463 dup v15.2d, v10.d[1]
464 /* Even part: reverse the even part of the forward DCT. */
DRCcf888482016-02-02 23:17:06 -0600465 add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
466 add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
467 sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
468 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
469 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
470 mov v20.16b, v18.16b /* tmp3 = z1 */
471 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
472 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
473 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
474 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
475 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
476 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
477 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
DRCba55b2c2014-02-05 08:15:44 +0000478
DRCcb49bb02016-02-02 23:10:27 -0600479 /* Odd part per figure 8; the matrix is unitary and hence its
480 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
481 */
DRCba55b2c2014-02-05 08:15:44 +0000482
DRCcf888482016-02-02 23:17:06 -0600483 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
484 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
485 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
486 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
487 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
DRCba55b2c2014-02-05 08:15:44 +0000488
DRCcf888482016-02-02 23:17:06 -0600489 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
490 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
491 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
492 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
493 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
494 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
495 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
496 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
497 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
DRCcb49bb02016-02-02 23:10:27 -0600498
DRCcf888482016-02-02 23:17:06 -0600499 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
500 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
DRCcb49bb02016-02-02 23:10:27 -0600501
DRCcf888482016-02-02 23:17:06 -0600502 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
503 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
504 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
505 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
DRCcb49bb02016-02-02 23:10:27 -0600506
DRCcf888482016-02-02 23:17:06 -0600507 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
508 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
509 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
510 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
DRCcb49bb02016-02-02 23:10:27 -0600511
512 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
513
DRCcf888482016-02-02 23:17:06 -0600514 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
515 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
516 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
517 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
518 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
519 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
520 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
521 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
DRCcb49bb02016-02-02 23:10:27 -0600522
DRCcf888482016-02-02 23:17:06 -0600523 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
524 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
525 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
526 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
527 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
528 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
529 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
530 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
531 mov v6.16b, v15.16b
532 mov v7.16b, v15.16b
533 mov v8.16b, v15.16b
534 mov v9.16b, v15.16b
535 b 1b
DRCcb49bb02016-02-02 23:10:27 -0600536
537.balign 16
5383:
DRCcf888482016-02-02 23:17:06 -0600539 cbnz TMP4, 4f
DRCcb49bb02016-02-02 23:10:27 -0600540 /* Left AC coef is zero */
541 dup v14.2d, v10.d[0]
542 /* Even part: reverse the even part of the forward DCT. */
DRCcf888482016-02-02 23:17:06 -0600543 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
544 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
545 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
546 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
547 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
548 mov v21.16b, v19.16b /* tmp3 = z1 */
549 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
550 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
551 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
552 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
553 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
554 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
555 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
DRCcb49bb02016-02-02 23:10:27 -0600556
557 /* Odd part per figure 8; the matrix is unitary and hence its
558 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
559 */
560
DRCcf888482016-02-02 23:17:06 -0600561 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
562 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
563 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
564 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
565 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
DRCcb49bb02016-02-02 23:10:27 -0600566
DRCcf888482016-02-02 23:17:06 -0600567 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
568 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
569 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
570 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
571 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
572 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
573 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
574 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
575 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
DRCcb49bb02016-02-02 23:10:27 -0600576
DRCcf888482016-02-02 23:17:06 -0600577 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
578 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
579 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
580 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
DRCcb49bb02016-02-02 23:10:27 -0600581
DRCcf888482016-02-02 23:17:06 -0600582 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
583 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
584 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
585 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
DRCcb49bb02016-02-02 23:10:27 -0600586
DRCcf888482016-02-02 23:17:06 -0600587 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
588 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
589 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
590 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
DRCcb49bb02016-02-02 23:10:27 -0600591
592 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
593
DRCcf888482016-02-02 23:17:06 -0600594 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
595 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
596 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
597 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
598 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
599 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
600 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
601 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
DRCcb49bb02016-02-02 23:10:27 -0600602
DRCcf888482016-02-02 23:17:06 -0600603 mov v2.16b, v14.16b
604 mov v3.16b, v14.16b
605 mov v4.16b, v14.16b
606 mov v5.16b, v14.16b
607 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
608 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
609 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
610 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
611 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
612 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
613 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
614 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
615 b 1b
DRCcb49bb02016-02-02 23:10:27 -0600616
617.balign 16
6184:
619 /* "No" AC coef is zero */
620 /* Even part: reverse the even part of the forward DCT. */
DRCcf888482016-02-02 23:17:06 -0600621 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
622 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
623 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
624 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
625 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
626 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
627 mov v21.16b, v19.16b /* tmp3 = z1 */
628 mov v20.16b, v18.16b /* tmp3 = z1 */
629 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
630 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
631 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
632 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
633 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
634 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
635 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
636 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
637 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
638 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
639 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
640 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
641 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
642 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
643 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
DRCcb49bb02016-02-02 23:10:27 -0600644
645 /* Odd part per figure 8; the matrix is unitary and hence its
646 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
647 */
648
DRCcf888482016-02-02 23:17:06 -0600649 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
650 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
651 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
652 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
653 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
DRCcb49bb02016-02-02 23:10:27 -0600654
DRCcf888482016-02-02 23:17:06 -0600655 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
656 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
657 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
658 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
659 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
660 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
661 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
662 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
663 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
DRCcb49bb02016-02-02 23:10:27 -0600664
DRCcf888482016-02-02 23:17:06 -0600665 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
666 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
667 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
668 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
669 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
670 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
671 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
672 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
673 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
DRCcb49bb02016-02-02 23:10:27 -0600674
DRCcf888482016-02-02 23:17:06 -0600675 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
676 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
677 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
678 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
DRCcb49bb02016-02-02 23:10:27 -0600679
DRCcf888482016-02-02 23:17:06 -0600680 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
681 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
682 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
683 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
684 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
685 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
686 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
687 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
DRCcb49bb02016-02-02 23:10:27 -0600688
DRCcf888482016-02-02 23:17:06 -0600689 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
690 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
691 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
692 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
693 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
694 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
695 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
696 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
DRCcb49bb02016-02-02 23:10:27 -0600697
698 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
699
DRCcf888482016-02-02 23:17:06 -0600700 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
701 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
702 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
703 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
704 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
705 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
706 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
707 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
708 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
709 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
710 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
711 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
712 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
713 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
714 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
715 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
DRCcb49bb02016-02-02 23:10:27 -0600716
DRCcf888482016-02-02 23:17:06 -0600717 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
718 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
719 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
720 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
721 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
722 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
723 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
724 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
725 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
726 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
727 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
728 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
729 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
730 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
731 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
732 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
733 b 1b
DRCba55b2c2014-02-05 08:15:44 +0000734
735 .unreq DCT_TABLE
736 .unreq COEF_BLOCK
737 .unreq OUTPUT_BUF
738 .unreq OUTPUT_COL
739 .unreq TMP1
740 .unreq TMP2
741 .unreq TMP3
742 .unreq TMP4
DRCcb49bb02016-02-02 23:10:27 -0600743 .unreq TMP5
744 .unreq TMP6
745 .unreq TMP7
746 .unreq TMP8
DRCba55b2c2014-02-05 08:15:44 +0000747
DRCcb49bb02016-02-02 23:10:27 -0600748#undef CENTERJSAMPLE
749#undef CONST_BITS
750#undef PASS1_BITS
751#undef XFIX_P_0_298
752#undef XFIX_N_0_390
753#undef XFIX_P_0_541
754#undef XFIX_P_0_765
755#undef XFIX_N_0_899
756#undef XFIX_P_1_175
757#undef XFIX_P_1_501
758#undef XFIX_N_1_847
759#undef XFIX_N_1_961
760#undef XFIX_P_2_053
761#undef XFIX_N_2_562
762#undef XFIX_P_3_072
DRCba55b2c2014-02-05 08:15:44 +0000763
764
765/*****************************************************************************/
766
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * In the ARM NEON case, however, some extra additions are required because
 * the SQDMULH instruction (VQDMULH on ARMv7) can't handle constants larger
 * than 1. Expressions like "x * 1.082392200" therefore have to be converted
 * to "x * 0.082392200 + x", which introduces an extra addition. Overall,
 * there are 6 extra additions per 1-D IDCT pass, for a total of 5 SQDMULH
 * and 35 ADD/SUB instructions.
 */
782
/* Lanes of v0 that hold the fractional multipliers for the fast IDCT. */
#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

/*
 * SQDMULH returns the high half of 2*a*b, so a table entry of (n * 128)
 * multiplies by n/256.  Only the part of each constant that exceeds the
 * integer 1 (or 2 for 2.613125930) is stored; the code adds the integer
 * multiple of the operand back separately.
 */
.balign 16
Ljsimd_idct_ifast_neon_consts:
    .short ((277 - 256) * 128)          /* XFIX_1_082392200 */
    .short ((362 - 256) * 128)          /* XFIX_1_414213562 */
    .short ((473 - 256) * 128)          /* XFIX_1_847759065 */
    .short ((669 - 512) * 128)          /* XFIX_2_613125930 */
DRCba55b2c2014-02-05 08:15:44 +0000794
/*
 * idct_ifast_1d: one 1-D AAN IDCT pass, performed in place.
 *   In/out:   v16-v23 (.8h) = input vectors 0..7 / output vectors 0..7
 *   Reads:    v0.h[0..3] (XFIX_* fractional constants)
 *   Clobbers: v1-v6
 * Private to jsimd_idct_ifast_neon; purged after the function.
 */
.macro idct_ifast_1d
    sub             v2.8h, v18.8h, v22.8h       /* in2 - in6 */
    add             v22.8h, v18.8h, v22.8h      /* in2 + in6 */
    sub             v1.8h, v19.8h, v21.8h       /* in3 - in5 */
    add             v21.8h, v19.8h, v21.8h      /* in3 + in5 */
    sub             v5.8h, v17.8h, v23.8h       /* in1 - in7 */
    add             v23.8h, v17.8h, v23.8h      /* in1 + in7 */
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h         /* 2 * (in3 - in5) */
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h        /* (in2 - in6) * 1.414213562 */
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h         /* (in3 - in5) * 2.613125930 */
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h         /* v1 *= 1.847759065 */
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h         /* v2 *= 1.414213562 */
    sub             v6.8h, v16.8h, v20.8h       /* in0 - in4 */
    add             v20.8h, v16.8h, v20.8h      /* in0 + in4 */
    add             v17.8h, v5.8h, v4.8h        /* (in1 - in7) * 1.082392200 */
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h       /* out7 */
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h       /* out0 */
    add             v22.8h, v5.8h, v3.8h        /* out6 */
    sub             v17.8h, v5.8h, v3.8h        /* out1 */
    sub             v21.8h, v18.8h, v2.8h       /* out5 */
    add             v18.8h, v18.8h, v2.8h       /* out2 */
    sub             v19.8h, v20.8h, v1.8h       /* out3 */
    add             v20.8h, v20.8h, v1.8h       /* out4 */
.endm

/*
 * jsimd_idct_ifast_neon
 *
 * In:    x0 = DCT_TABLE   (quantization multipliers, 64 x 16-bit)
 *        x1 = COEF_BLOCK  (coefficients, 64 x 16-bit)
 *        x2 = OUTPUT_BUF  (array of 8 row pointers)
 *        x3 = OUTPUT_COL  (byte offset added to every row pointer)
 * Out:   8 rows of 8 bytes written through the row pointers.
 * Clobb: x0-x3, x9-x14, v0-v6, v16-v23, v28-v31.  None of these are
 *        callee-saved, so no register save area is needed.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0     /* reuses DCT_TABLE once the table is loaded */
    TMP2            .req x1     /* reuses COEF_BLOCK once the block is loaded */
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 |      v16.8h
     *   1 |      v17.8h
     *   2 |      v18.8h
     *   3 |      v19.8h
     *   4 |      v20.8h
     *   5 |      v21.8h
     *   6 |      v22.8h
     *   7 |      v23.8h
     *
     * Loads of the next rows are interleaved with the multiplies of the
     * rows already loaded; v0-v3 are recycled for the quantization table.
     */
    adr             TMP5, Ljsimd_idct_ifast_neon_consts
    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul             v16.8h, v16.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v17.8h, v17.8h, v1.8h
    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul             v18.8h, v18.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v19.8h, v19.8h, v3.8h
    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul             v20.8h, v20.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v22.8h, v22.8h, v2.8h
    mul             v21.8h, v21.8h, v1.8h
    ld1             {v0.4h}, [TMP5]             /* load constants */
    mul             v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    idct_ifast_1d
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    idct_ifast_1d

    /* Descale to 8-bit and range limit: a saturating signed narrow by 5
     * bits followed by a wrapping byte add of 0x80 (v0 is free again now
     * that the multiplier constants are no longer needed).
     */
    movi            v0.16b, #0x80
    /* Prepare pointers (dual-issue with NEON instructions) */
    /* NOTE(review): OUTPUT_COL is added as a full 64-bit register.  If the C
     * prototype passes it as a 32-bit integer, AAPCS64 does not guarantee
     * that the upper half of x3 is zero -- confirm against the caller and
     * zero-extend (uxtw) if so.
     */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
    ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
    add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
    add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
    add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
    add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
    ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
    ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
    add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
    add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
    add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
    add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer (8 bytes per row pointer) */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    /* Return with RET rather than "blr x30": BLR is a call-type branch that
     * overwrites the link register and unbalances the return predictor.
     */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8

.purgem idct_ifast_1d
DRCba55b2c2014-02-05 08:15:44 +0000993
994
995/*****************************************************************************/
996
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */
1014
#define CONST_BITS  13

/* Fixed-point multipliers used by the reduced-size (4x4 and 2x2) IDCTs. */
#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

/*
 * Multiplier table for jsimd_idct_4x4_neon, laid out one row per vector
 * register as loaded into v0.4h, v1.4h and v2.4h (lanes h[0]..h[3]).
 */
.balign 16
Ljsimd_idct_4x4_neon_consts:
    /* v0.h[0..3] */
    .short  FIX_1_847759065, -FIX_0_765366865, -FIX_0_211164243,  FIX_1_451774981
    /* v1.h[0..3] */
    .short -FIX_2_172734803,  FIX_1_061594337, -FIX_0_509795579, -FIX_0_601344887
    /* v2.h[0..3] */
    .short  FIX_0_899976223,  FIX_2_562915447, (1 << (CONST_BITS + 1)), 0
DRCba55b2c2014-02-05 08:15:44 +00001046
/*
 * idct_helper (4x4 variant): one 1-D pass of the reduced 4x4 IDCT on four
 * lanes at a time.
 *   \x4 .. \x16  input vectors (.4h); the number is twice the input index,
 *                so \x4 = input 0, \x6 = input 1, ... \x16 = input 7
 *                (input 4 is not used)
 *   \shift       rounding right shift applied to the 32-bit results
 *   \y26 .. \y29 output vectors (.4h) for outputs 0..3
 * Reads the multipliers in v0-v2 (see Ljsimd_idct_4x4_neon_consts).
 * Clobbers v20, v24, v26, v28, v30.  No input may alias a clobbered register;
 * outputs are written only after every input has been read.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    /* Even part: v28 / v30 share the same three products, added in v28 and
     * (for the last two) subtracted in v30.
     */
    smull           v28.4s, \x4, v2.h[2]
    smull           v30.4s, \x4, v2.h[2]
    smlal           v28.4s, \x8, v0.h[0]
    smlsl           v30.4s, \x8, v0.h[0]
    smlal           v28.4s, \x14, v0.h[1]
    smlsl           v30.4s, \x14, v0.h[1]

    /* Odd part: two independent 4-term sums of products. */
    smull           v26.4s, \x16, v1.h[2]
    smull           v24.4s, \x16, v0.h[2]
    smlal           v26.4s, \x12, v1.h[3]
    smlal           v24.4s, \x12, v0.h[3]
    smlal           v26.4s, \x10, v2.h[0]
    smlal           v24.4s, \x10, v1.h[0]
    smlal           v26.4s, \x6, v2.h[1]
    smlal           v24.4s, \x6, v1.h[1]

    /* Outputs 0 and 3: v28 +/- v26, rounded and narrowed to 16 bits. */
    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

  .if \shift <= 16
    /* A narrowing shift can encode the whole shift amount directly. */
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
  .else
    /* Shift is too large for RSHRN to 16-bit lanes: shift, then narrow. */
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
  .endif

    /* Outputs 1 and 2: v30 +/- v24, rounded and narrowed to 16 bits. */
    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

  .if \shift <= 16
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
  .else
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
  .endif
.endm
1092
/*
 * jsimd_idct_4x4_neon
 *
 * In:    x0 = DCT_TABLE   (quantization multipliers, 64 x 16-bit)
 *        x1 = COEF_BLOCK  (coefficients, 64 x 16-bit)
 *        x2 = OUTPUT_BUF  (array of 4 row pointers)
 *        x3 = OUTPUT_COL  (byte offset added to every row pointer)
 * Out:   4 rows of 4 bytes written through the row pointers.
 * Stack: 64 bytes (save area for d8-d15).
 * Clobb: x0-x3, x15, v0-v7, v16-v31 (all caller-saved under AAPCS64).
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0     /* reuses DCT_TABLE once the table is loaded */
    TMP2            .req x1     /* reuses COEF_BLOCK once the block is loaded */
    TMP3            .req x2     /* reuses OUTPUT_BUF (loaded last from it) */
    TMP4            .req x15

    /* Save the callee-saved NEON registers used below (low 64 bits of
     * v8-v15).  SP stays lowered for the whole function body so that the
     * save area is never below the stack pointer, where a signal handler
     * could overwrite it.  Every other register touched here is
     * caller-saved and needs no save slot.
     */
    stp             d8, d9, [sp, #-64]!
    stp             d10, d11, [sp, #16]
    stp             d12, d13, [sp, #32]
    stp             d14, d15, [sp, #48]

    /* Load constants (v3.4h is just used for padding) */
    adr             TMP4, Ljsimd_idct_4x4_neon_consts
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | v8.4h   | v9.4h
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | v14.4h  | v15.4h
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16         /* skip row 4 */
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16

    /* Dequantize: multiply every loaded half-row by the matching half-row
     * of DCT_TABLE, then mirror the right half into the upper 64 bits of
     * the left-half register.
     */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]            /* v4 = full row 0 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]            /* v6 = full row 1 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]            /* v8 = full row 2 */
    add             DCT_TABLE, DCT_TABLE, #16   /* skip row 4 */
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]          /* v10 = full row 3 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]          /* v12 = full row 5 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]          /* v14 = full row 6 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]          /* v16 = full row 7 */

    /* Pass 1: columns 0-3 (left halves), then columns 4-7 (right halves) */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit: add 0x80 to each 16-bit sample, then saturate to u8 */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    /* Byte-wise stores: rows 0 and 2 come from lanes 0-3 of v26 / v27 */
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    /* ... and rows 1 and 3 from lanes 4-7 */
    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* Restore d8-d15 and release the save area */
    ldp             d10, d11, [sp, #16]
    ldp             d12, d13, [sp, #32]
    ldp             d14, d15, [sp, #48]
    ldp             d8, d9, [sp], #64
    /* Return with RET rather than "blr x30": BLR is a call-type branch that
     * overwrites the link register and unbalances the return predictor.
     */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper
1247
1248
1249/*****************************************************************************/
1250
1251/*
1252 * jsimd_idct_2x2_neon
1253 *
1254 * This function contains inverse-DCT code for getting reduced-size
1255 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1256 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1257 * function from jpeg-6b (jidctred.c).
1258 *
1259 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1260 * requires much less arithmetic operations and hence should be faster.
1261 * The primary purpose of this particular NEON optimized function is
1262 * bit exact compatibility with jpeg-6b.
1263 */
1264
/* Multipliers for the 2x2 IDCT, in the lane order used by its idct_helper. */
.balign 8
Ljsimd_idct_2x2_neon_consts:
    /* v14.h[0..3] */
    .short -FIX_0_720959822, FIX_0_850430095, -FIX_1_272758580, FIX_3_624509785
DRCba55b2c2014-02-05 08:15:44 +00001271
/*
 * idct_helper (2x2 variant): one 1-D pass of the reduced 2x2 IDCT.
 *   \x4, \x6, \x10, \x12, \x16  input vectors (.4h)
 *   \shift                      rounding right shift for the 32-bit results
 *   \y26, \y27                  output vectors (.4h): sum and difference
 * Reads the multipliers in v14.h[0..3]; clobbers v15, v20 and v26.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    /* v15 = \x4 widened to 32 bits and scaled by 2^15 */
    sshll           v15.4s, \x4, #15
    /* v26 = sum of the four remaining inputs times their multipliers */
    smull           v26.4s, \x6, v14.h[3]
    smlal           v26.4s, \x10, v14.h[2]
    smlal           v26.4s, \x12, v14.h[1]
    smlal           v26.4s, \x16, v14.h[0]

    /* Butterfly: v20 = v15 + v26, v15 = v15 - v26 */
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s

  .if \shift <= 16
    /* A narrowing shift can encode the whole shift amount directly. */
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y27, v15.4s, #\shift
  .else
    /* Shift is too large for RSHRN to 16-bit lanes: shift, then narrow. */
    srshr           v20.4s, v20.4s, #\shift
    srshr           v15.4s, v15.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y27, v15.4s
  .endif
.endm
1292
/*
 * jsimd_idct_2x2_neon
 *
 * Register arguments (see the .req aliases below):
 *   x0  DCT_TABLE   - 16-bit dequantization multipliers; rows 0, 1, 3, 5, 7
 *                     are read
 *   x1  COEF_BLOCK  - 16-bit coefficients; rows 0, 1, 3, 5, 7 are read
 *   x2  OUTPUT_BUF  - array of row pointers; entries [0] and [1] are used
 *   x3  OUTPUT_COL  - offset added to both row pointers
 *
 * Writes two bytes to each of the two output rows.
 *
 * Preserves the low 64 bits of v8-v15 (the AAPCS64 callee-saved part).
 * Clobbers x0, x1, x9, x15 and the caller-saved vector registers it uses.
 *
 * NOTE(review): OUTPUT_COL is consumed as a full 64-bit value.  If the C
 * prototype declares it as a 32-bit type, the upper half of x3 is unspecified
 * on entry and should be zero-extended first - confirm against the caller.
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0     /* reused once DCT_TABLE has been consumed */
    TMP2            .req x15

    /* Reserve the save area first and fill it through a scratch pointer, so
     * that sp stays below the saved data for the whole function body.  Data
     * below sp may be overwritten asynchronously (e.g. by a signal frame). */
    sub             sp, sp, #64
    mov             x9, sp

    /* Load constants */
    adr             TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], #32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], #32
    ld1             {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | -       | -
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | -       | -
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16     /* skip row 2 */
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16     /* skip row 4 */
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16     /* skip row 6 */
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16

    /* Dequantize; each row's two halves are merged into one 128-bit register */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16       /* skip row 2 */
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16       /* skip row 4 */
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16       /* skip row 6 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1: same arithmetic as idct_helper with shift 13, expanded by hand
     * for columns 0-3 (v26) and 4-7 (v24), followed by the transposition
     * that prepares the operands of pass 2. */
    smull           v26.4s, v6.4h, v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h, v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h, #15
    sshll           v30.4s, v5.4h, #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h, v20.4s, #13
    rshrn           v6.4h, v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h, v20.4s, #13
    rshrn           v7.4h, v15.4s, #13
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: add 128 and saturate to unsigned 8 bit.  After the second
     * narrowing, v27.b[4] and v27.b[5] hold the saturated lanes 0 and 1 of
     * the original v27 result. */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    /* Restore callee-saved registers; the post-increments release the frame */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 * jsimd_ycc_rgb565_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 *
 * The extrgb and extbgr converters are additionally generated with a
 * "_slowst3" suffix; those variants store pixels byte by byte instead of
 * using ST3.
 */

/*
 * do_load size
 *
 * Load \size bytes from each of the U, V and Y pointers into v4, v5 and v0
 * respectively, post-incrementing the pointers.
 *
 *   size 8: whole 8-byte vectors, plus prefetch hints 64 bytes ahead
 *   size 4: lanes 0-3
 *   size 2: lanes 4-5
 *   size 1: lane 6
 *
 * The partial sizes use disjoint lanes, so a 4/2/1 sequence assembles a tail
 * of up to 7 samples in one register.  Any other size is an assembly-time
 * error.
 */
.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [U], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error "unsupported macroblock size"
  .endif
.endm

/*
 * do_store bpp, size, fast_st3
 *
 * Store \size converted pixels through the RGB pointer, post-incrementing it.
 *
 *   bpp 24: interleave v10, v11, v12 (3 bytes per pixel)
 *   bpp 32: interleave v10, v11, v12, v13 (4 bytes per pixel)
 *   bpp 16: store halfwords of v25 (2 bytes per pixel)
 *
 *   size 8 writes all eight lanes; sizes 4, 2 and 1 write lanes 0-3, 4-5
 *   and 6 respectively, matching the lane layout used by do_load.
 *
 * fast_st3 is only consulted for bpp 24 / size 8: 1 selects a single ST3,
 * any other value selects 24 single-byte stores producing the same memory
 * layout.
 *
 * Unsupported bpp or size values are assembly-time errors.
 */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1         {v10.b}[0], [RGB], #1
        st1         {v11.b}[0], [RGB], #1
        st1         {v12.b}[0], [RGB], #1

        st1         {v10.b}[1], [RGB], #1
        st1         {v11.b}[1], [RGB], #1
        st1         {v12.b}[1], [RGB], #1

        st1         {v10.b}[2], [RGB], #1
        st1         {v11.b}[2], [RGB], #1
        st1         {v12.b}[2], [RGB], #1

        st1         {v10.b}[3], [RGB], #1
        st1         {v11.b}[3], [RGB], #1
        st1         {v12.b}[3], [RGB], #1

        st1         {v10.b}[4], [RGB], #1
        st1         {v11.b}[4], [RGB], #1
        st1         {v12.b}[4], [RGB], #1

        st1         {v10.b}[5], [RGB], #1
        st1         {v11.b}[5], [RGB], #1
        st1         {v12.b}[5], [RGB], #1

        st1         {v10.b}[6], [RGB], #1
        st1         {v11.b}[6], [RGB], #1
        st1         {v12.b}[6], [RGB], #1

        st1         {v10.b}[7], [RGB], #1
        st1         {v11.b}[7], [RGB], #1
        st1         {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error "unsupported macroblock size"
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error "unsupported macroblock size"
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1           {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1           {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1           {v25.h}[4], [RGB], 2
      st1           {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1           {v25.h}[6], [RGB], 2
    .else
      .error "unsupported macroblock size"
    .endif
  .else
    .error "unsupported bpp"
  .endif
.endm

/*
 * generate_jsimd_ycc_rgb_convert_neon
 *
 * Emits one YCbCr -> RGB conversion function together with its private
 * constant table.
 *
 *   colorid   - name fragment: the function is jsimd_ycc_<colorid>_convert_neon
 *   bpp       - output bytes-per-pixel * 8: 24, 32 or 16 (RGB565)
 *   r_offs, g_offs, b_offs
 *             - digit appended to "v1" to pick the register (v10..v13) that
 *               receives that channel for bpp 24/32; not used for bpp 16
 *   rsize, gsize, bsize
 *             - accepted but not referenced by the macro body
 *   defsize   - arrangement suffix of the narrowed channel registers (.8b)
 *   fast_st3  - 1: use ST3 for full 24 bpp groups and the plain symbol names;
 *               otherwise: byte-wise stores, and "_slowst3" is appended to the
 *               function and constant-table names
 *
 * Requires the do_load and do_store macros of this section.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 *
 * Stage 1 computes the 32-bit chroma products; stage 2 scales them back,
 * adds luma and packs the result.
 */

.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15     /* green term */
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14     /* red term */
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14     /* blue term */
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b   /* add y */
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    /* RGB565: saturate each channel into the top byte of a halfword, then
     * shift-insert green and blue below red in v25 */
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm

/* Stage 2 of the current group, its store, the load of the next group and
 * stage 1 of that group, interleaved by hand.  The instruction order is
 * deliberate; do not reorder. */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
.if \fast_st3 == 1
Ljsimd_ycc_\colorid\()_neon_consts:
.else
Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
.endif
    .short          0,      0,     0,      0        /* v0.4h: padding */
    .short          22971, -11277, -23401, 29033    /* v1.4h: multipliers */
    .short          -128,  -128,   -128,   -128     /* v2.8h: chroma bias */
    .short          -128,  -128,   -128,   -128

/*
 * Register arguments (see the .req aliases below):
 *   x0  OUTPUT_WIDTH - pixels per row
 *   x1  INPUT_BUF    - array of three plane pointers (Y, U, V row arrays)
 *   x2  INPUT_ROW    - index of the first input row
 *   x3  OUTPUT_BUF   - array of output row pointers
 *   x4  NUM_ROWS     - number of rows to convert
 *
 * Preserves the low 64 bits of v8-v15.  Clobbers x1, x2-x10, x15, x16 and
 * the caller-saved vector registers it uses.
 *
 * NOTE(review): OUTPUT_WIDTH, INPUT_ROW and NUM_ROWS are consumed as 64-bit
 * values.  If the C prototype declares them as 32-bit types, the upper
 * halves of x0/x2/x4 are unspecified on entry - confirm against the caller.
 */
.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req x0
    INPUT_BUF       .req x1
    INPUT_ROW       .req x2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req x4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1     /* reuses INPUT_BUF once it has been read */

    RGB             .req x7
    Y               .req x8
    U               .req x9
    V               .req x10
    N               .req x15

    /* Reserve the save area first and fill it through a scratch pointer, so
     * that sp stays below the saved data for the whole function body.  Data
     * below sp may be overwritten asynchronously (e.g. by a signal frame).
     * x16 is free here; it is first written inside the row loop. */
    sub             sp, sp, #64
    mov             x16, sp

    /* Load constants to v0.4h (padding), v1.4h and v2.8h */
  .if \fast_st3 == 1
    adr             x15, Ljsimd_ycc_\colorid\()_neon_consts
  .else
    adr             x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
  .endif

    /* Save the callee-saved halves of v8-v15 */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x16], #32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x16], #32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    /* Fetch the three plane pointers */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Preset v10 and v13 to 0xFF.  For the 32 bpp formats one of them is
     * never written by the conversion and supplies the filler byte. */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    lsl             x16, INPUT_ROW, #3
    ldr             Y, [INPUT_BUF0, x16]
    ldr             U, [INPUT_BUF1, x16]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, x16]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:
    /* Tail: the low three bits of N give the 0-7 leftover pixels */
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore callee-saved registers; the post-increments release the frame */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    ret

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/* Instantiate the YCbCr -> RGB converters.  Argument order:
 *   colorid, bpp, R, rsize, G, gsize, B, bsize, defsize, fast_st3 */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h, 0, .4h, 0, .4h, .8b, 1

/* Variants of the 24 bpp converters that avoid ST3 (fast_st3 = 0) */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h, 1, .4h, 0, .4h, .8b, 0

/* Release the macro names; the next section defines its own pair */
.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 *
 * The extrgb and extbgr converters are additionally generated with a
 * "_slowld3" suffix; those variants load pixels byte by byte instead of
 * using LD3.
 */

/*
 * do_store size
 *
 * Store \size converted samples from v20, v21 and v22 through the Y, U and V
 * pointers respectively, post-incrementing the pointers.
 *
 *   size 8: whole 8-byte vectors
 *   size 4: lanes 0-3
 *   size 2: lanes 4-5
 *   size 1: lane 6
 *
 * Any other size is an assembly-time error.
 */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error "unsupported macroblock size"
  .endif
.endm

/*
 * do_load bpp, size, fast_ld3
 *
 * Load \size pixels through the RGB pointer (post-incrementing it) and
 * de-interleave them into v10, v11, v12 (bpp 24) or v10, v11, v12, v13
 * (bpp 32).
 *
 *   size 8: all eight lanes, plus a prefetch hint 128 bytes ahead
 *   size 4: lanes 0-3
 *   size 2: lanes 4-5
 *   size 1: lane 6
 *
 * fast_ld3 is only consulted for bpp 24 / size 8: 1 selects a single LD3,
 * any other value selects 24 single-byte loads filling the same lanes.
 *
 * Unsupported bpp or size values are assembly-time errors.
 */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1         {v10.b}[0], [RGB], #1
        ld1         {v11.b}[0], [RGB], #1
        ld1         {v12.b}[0], [RGB], #1

        ld1         {v10.b}[1], [RGB], #1
        ld1         {v11.b}[1], [RGB], #1
        ld1         {v12.b}[1], [RGB], #1

        ld1         {v10.b}[2], [RGB], #1
        ld1         {v11.b}[2], [RGB], #1
        ld1         {v12.b}[2], [RGB], #1

        ld1         {v10.b}[3], [RGB], #1
        ld1         {v11.b}[3], [RGB], #1
        ld1         {v12.b}[3], [RGB], #1

        ld1         {v10.b}[4], [RGB], #1
        ld1         {v11.b}[4], [RGB], #1
        ld1         {v12.b}[4], [RGB], #1

        ld1         {v10.b}[5], [RGB], #1
        ld1         {v11.b}[5], [RGB], #1
        ld1         {v12.b}[5], [RGB], #1

        ld1         {v10.b}[6], [RGB], #1
        ld1         {v11.b}[6], [RGB], #1
        ld1         {v12.b}[6], [RGB], #1

        ld1         {v10.b}[7], [RGB], #1
        ld1         {v11.b}[7], [RGB], #1
        ld1         {v12.b}[7], [RGB], #1
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error "unsupported macroblock size"
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error "unsupported macroblock size"
    .endif
  .else
    .error "unsupported bpp"
  .endif
.endm

/*
 * generate_jsimd_rgb_ycc_convert_neon
 *
 * Emits one RGB -> YCbCr conversion function together with its private
 * constant table.
 *
 *   colorid   - name fragment: the function is jsimd_<colorid>_ycc_convert_neon
 *   bpp       - input bytes-per-pixel * 8: 24 or 32
 *   r_offs, g_offs, b_offs
 *             - digit appended to "v1" to pick the register (v10..v13) that
 *               holds that channel after do_load
 *   fast_ld3  - 1: use LD3 for full 24 bpp groups and the plain symbol names;
 *               otherwise: byte-wise loads, and "_slowld3" is appended to the
 *               function and constant-table names
 *
 * Requires the do_load and do_store macros of this section.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 *
 * Stage 1 widens the channels and accumulates the 32-bit sums:
 *   v14/v16 = r*h[0] + g*h[1] + b*h[2]                 (y)
 *   v18/v26 = bias   - r*h[3] - g*h[4] + b*h[5]        (u)
 *   v28/v30 = bias   + r*h[5] - g*h[6] - b*h[7]        (v)
 * where h[n] is v0.h[n] and bias is the 32-bit pattern held in v1.
 * Stage 2 takes the upper 16 bits and narrows to bytes.
 */

.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s               /* seed accumulators with bias */
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16     /* y is rounded */
    shrn            v22.4h, v18.4s, #16     /* u and v are truncated */
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h          /* v20 = y */
    xtn             v21.8b, v22.8h          /* v21 = u */
    xtn             v22.8b, v24.8h          /* v22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.balign 16
.if \fast_ld3 == 1
Ljsimd_\colorid\()_ycc_neon_consts:
.else
Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
.endif
    .short          19595, 38470, 7471,  11059      /* v0.h[0..3] */
    .short          21709, 32768, 27439, 5329       /* v0.h[4..7] */
    .short          32767, 128,   32767, 128        /* v1: chroma bias words */
    .short          32767, 128,   32767, 128

/*
 * Register arguments (see the .req aliases below):
 *   w0  OUTPUT_WIDTH - pixels per row
 *   x1  INPUT_BUF    - array of input row pointers
 *   x2  OUTPUT_BUF   - array of three plane pointers (Y, U, V row arrays)
 *   x3  OUTPUT_ROW   - index of the first output row
 *   x4  NUM_ROWS     - number of rows to convert
 *
 * Preserves the low 64 bits of v8-v15.  Clobbers x1-x7, x9-x13 and the
 * caller-saved vector registers it uses.
 *
 * NOTE(review): OUTPUT_WIDTH is read as a 32-bit register, but OUTPUT_ROW and
 * NUM_ROWS are consumed as 64-bit values.  If the C prototype declares those
 * as 32-bit types, the upper halves of x3/x4 are unspecified on entry -
 * confirm against the caller.
 */
.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req x3
    NUM_ROWS        .req x4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2     /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants to v0 and v1 */
  .if \fast_ld3 == 1
    adr             x13, Ljsimd_\colorid\()_ycc_neon_consts
  .else
    adr             x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
  .endif
    ld1             {v0.8h, v1.8h}, [x13]

    /* Fetch the three plane pointers */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save the callee-saved halves of v8-v15.  The area is reserved first and
     * filled through a scratch pointer (x13 is free again here), so that sp
     * stays below the saved data for the whole function body.  Data below sp
     * may be overwritten asynchronously (e.g. by a signal frame). */
    sub             sp, sp, #64
    mov             x13, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x13], #32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x13], #32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    /* Tail: the low three bits of N give the 0-7 leftover pixels */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore callee-saved registers; the post-increments release the frame */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    ret

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*
 * Instantiate generate_jsimd_rgb_ycc_convert_neon once per supported pixel
 * layout.  The first six rows pass Fast LD3 = 1; the last two rows
 * re-instantiate the 24-bpp layouts with Fast LD3 = 0.
 */
/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

/* The load/store helper macros are not needed past this point. */
.purgem do_load
.purgem do_store
2177
DRCcf888482016-02-02 23:17:06 -06002178
DRCec6941f2016-01-15 09:29:11 -06002179/*****************************************************************************/
2180
/*
 * jsimd_convsamp_neon
 *
 * Load an 8x8 block of 8-bit samples into the workspace, applying the
 * unsigned->signed conversion (sample - 128) while widening each sample to
 * 16 bits.
 *
 * In:    x0 = SAMPLE_DATA  array of 8 row pointers
 *        x1 = START_COL    column offset added to every row pointer
 *        x2 = WORKSPACE    destination for 64 halfwords (128 bytes)
 * Clobb: x0-x4, x9-x15, v0, v16-v23
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of the intermediate ST1 stores
 */

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* NOTE(review): assumes START_COL is a 32-bit unsigned argument in the C
     * prototype (confirm against the declaration).  AAPCS64 leaves the upper
     * 32 bits of the register unspecified for such arguments, so clear them
     * before START_COL is used in 64-bit address arithmetic.
     */
    uxtw            x1, w1

    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16   /* row pointers 0, 1 */
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16   /* row pointers 2, 3 */
    dup             v0.8b, TMPDUP                   /* v0 = 8 x 128 (centering bias) */
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16   /* row pointers 4, 5 */
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16   /* row pointers 6, 7 */
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b           /* widen to 16 bits, minus 128 */
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64  /* rows 0-3 */
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64  /* rows 4-7 */

    ret

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP
2249
2250/*****************************************************************************/
2251
/*
 * jsimd_fdct_islow_neon
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  The following code is based
 * directly on the IJG's original jfdctint.c; see the jfdctint.c for
 * more details.
 *
 * In:    x0 = DATA, 8x8 block of 16-bit coefficients, transformed in place
 * Clobb: x0, x9, v0-v5, v16-v31 (v8-v15 are used but saved and restored)
 * Stack: 64 bytes, holding the low halves of v8-v15 for the duration of the
 *        function
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of LD1 loads
 */

#define CONST_BITS 13
#define PASS1_BITS 2

#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
#define DESCALE_P2 (CONST_BITS+PASS1_BITS)

#define F_0_298  2446  /* FIX(0.298631336) */
#define F_0_390  3196  /* FIX(0.390180644) */
#define F_0_541  4433  /* FIX(0.541196100) */
#define F_0_765  6270  /* FIX(0.765366865) */
#define F_0_899  7373  /* FIX(0.899976223) */
#define F_1_175  9633  /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */

/* Constant table: the first 8 halfwords load into v0, the next 8 into v1. */
.balign 16
Ljsimd_fdct_islow_neon_consts:
    .short F_0_298
    .short -F_0_390
    .short F_0_541
    .short F_0_765
    .short -F_0_899
    .short F_1_175
    .short F_1_501
    .short -F_1_847
    .short -F_1_961
    .short F_2_053
    .short -F_2_562
    .short F_3_072
    .short 0  /* padding */
    .short 0
    .short 0
    .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Lane names for the table above (P = positive, N = negated constant). */
#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    adr             TMP, Ljsimd_fdct_islow_neon_consts
    ld1             {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers.
     * The low 64 bits of v8-v15 are callee-saved.  The save area must stay
     * at or above sp for as long as it is live, so sp is lowered once and the
     * stores go through a scratch pointer; sp is only raised again by the
     * restoring loads at the end of the function.
     */
    sub             sp, sp, #64
    mov             TMP, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [TMP], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [TMP], 32

    /* Load all DATA into NEON registers with the following allocation:
     *   row 0 -> v16.8h     row 4 -> v20.8h
     *   row 1 -> v17.8h     row 5 -> v21.8h
     *   row 2 -> v18.8h     row 6 -> v22.8h
     *   row 3 -> v19.8h     row 7 -> v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub             DATA, DATA, #64  /* rewind to the start of the block */

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT, pass 1 */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* Even part */

    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b              /* second copy of z1 lo for dataptr[6] */
    mov             v25.16b, v24.16b              /* second copy of z1 hi for dataptr[6] */

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P1
    rshrn           v22.4h, v22.4s, #DESCALE_P1
    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT, pass 2 (results are descaled by CONST_BITS+PASS1_BITS) */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* Even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b              /* second copy of z1 lo for dataptr[6] */
    mov             v25.16b, v24.16b              /* second copy of z1 hi for dataptr[6] */

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* Store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers; the two post-indexed loads also release the
     * 64-byte save area, returning sp to its value on entry.
     */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    ret

    .unreq          DATA
    .unreq          TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
2590
DRCcf888482016-02-02 23:17:06 -06002591
DRCec6941f2016-01-15 09:29:11 -06002592/*****************************************************************************/
2593
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform).  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * In:    x0 = DATA, 8x8 block of 16-bit coefficients, transformed in place
 * Clobb: x0, x9, flags, v0-v7, v16-v23, v28, v29
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of LD1 loads
 */

#undef XFIX_0_541196100
#define XFIX_0_382683433 v0.h[0]
#define XFIX_0_541196100 v0.h[1]
#define XFIX_0_707106781 v0.h[2]
#define XFIX_1_306562965 v0.h[3]

/* Multipliers for SQDMULH.  The last entry stores the constant minus
 * 256 * 128; the code below adds the unscaled operand back separately.
 */
.balign 16
Ljsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    adr             TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *   row 0 -> v16.8h     row 4 -> v20.8h
     *   row 1 -> v17.8h     row 5 -> v21.8h
     *   row 2 -> v18.8h     row 6 -> v22.8h
     *   row 3 -> v19.8h     row 7 -> v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2          /* two passes: transpose + 1-D FDCT each */
    sub             DATA, DATA, #64  /* rewind to the start of the block */
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    /* Pass counter: the flags set here are consumed by the b.ne at the end
     * of the loop body; none of the vector instructions in between alter
     * them.
     */
    subs            TMP, TMP, #1
    /* 1-D FDCT */
    add             v4.8h, v19.8h, v20.8h
    sub             v20.8h, v19.8h, v20.8h
    sub             v28.8h, v18.8h, v21.8h
    add             v18.8h, v18.8h, v21.8h
    sub             v29.8h, v17.8h, v22.8h
    add             v17.8h, v17.8h, v22.8h
    sub             v21.8h, v16.8h, v23.8h
    add             v16.8h, v16.8h, v23.8h
    sub             v6.8h, v17.8h, v18.8h
    sub             v7.8h, v16.8h, v4.8h
    add             v5.8h, v17.8h, v18.8h
    add             v6.8h, v6.8h, v7.8h
    add             v4.8h, v16.8h, v4.8h
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
    add             v19.8h, v20.8h, v28.8h
    add             v16.8h, v4.8h, v5.8h
    sub             v20.8h, v4.8h, v5.8h
    add             v5.8h, v28.8h, v29.8h
    add             v29.8h, v29.8h, v21.8h
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
    sub             v28.8h, v19.8h, v29.8h
    add             v18.8h, v7.8h, v6.8h
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
    sub             v22.8h, v7.8h, v6.8h
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
    add             v6.8h, v21.8h, v5.8h
    sub             v5.8h, v21.8h, v5.8h
    add             v29.8h, v29.8h, v28.8h
    add             v19.8h, v19.8h, v28.8h
    add             v29.8h, v29.8h, v7.8h
    add             v21.8h, v5.8h, v19.8h
    sub             v19.8h, v5.8h, v19.8h
    add             v17.8h, v6.8h, v29.8h
    sub             v23.8h, v6.8h, v29.8h

    b.ne            1b

    /* Store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    ret

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965
2700
DRCcf888482016-02-02 23:17:06 -06002701
DRCec6941f2016-01-15 09:29:11 -06002702/*****************************************************************************/
2703
/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
 *                      DCTELEM *workspace);
 *
 * Quantize the 64 coefficients in workspace and write them to coef_block.
 * For each coefficient:
 *   out = sign(in) * ((((|in| + correction) * reciprocal) >> 16) >> shift)
 *
 * In:    x0 = COEF_BLOCK  output, 64 halfwords
 *        x1 = DIVISORS    reciprocals at byte offset 0, corrections at
 *                         byte offset 64 * 2, shift counts at byte offset
 *                         64 * 6
 *        x2 = WORKSPACE   input, 64 halfwords
 * Clobb: x0-x2, x9-x11, flags, v0-v7, v16-v31
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req x0
    DIVISORS        .req x1
    WORKSPACE       .req x2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11

    mov             LOOP_COUNT, #2  /* 2 iterations x 32 coefficients */
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
1:
    /* The flags set here are consumed by the b.ne at the end of the loop
     * body; none of the vector instructions in between alter them.
     */
    subs            LOOP_COUNT, LOOP_COUNT, #1
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs             v20.8h, v0.8h
    abs             v21.8h, v1.8h
    abs             v22.8h, v2.8h
    abs             v23.8h, v3.8h
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add             v20.8h, v20.8h, v4.8h  /* add correction */
    add             v21.8h, v21.8h, v5.8h
    add             v22.8h, v22.8h, v6.8h
    add             v23.8h, v23.8h, v7.8h
    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
    umull2          v16.4s, v20.8h, v28.8h
    umull           v5.4s, v21.4h, v29.4h
    umull2          v17.4s, v21.8h, v29.8h
    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
    umull2          v18.4s, v22.8h, v30.8h
    umull           v7.4s, v23.4h, v31.4h
    umull2          v19.4s, v23.8h, v31.8h
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn            v4.4h, v4.4s, #16      /* keep the high half of each product */
    shrn            v5.4h, v5.4s, #16
    shrn            v6.4h, v6.4s, #16
    shrn            v7.4h, v7.4s, #16
    shrn2           v4.8h, v16.4s, #16
    shrn2           v5.8h, v17.4s, #16
    shrn2           v6.8h, v18.4s, #16
    shrn2           v7.8h, v19.4s, #16
    neg             v24.8h, v24.8h         /* negate: USHL by a negative count shifts right */
    neg             v25.8h, v25.8h
    neg             v26.8h, v26.8h
    neg             v27.8h, v27.8h
    sshr            v0.8h, v0.8h, #15      /* extract sign (0 or -1 per lane) */
    sshr            v1.8h, v1.8h, #15
    sshr            v2.8h, v2.8h, #15
    sshr            v3.8h, v3.8h, #15
    ushl            v4.8h, v4.8h, v24.8h   /* shift */
    ushl            v5.8h, v5.8h, v25.8h
    ushl            v6.8h, v6.8h, v26.8h
    ushl            v7.8h, v7.8h, v27.8h

    eor             v4.16b, v4.16b, v0.16b  /* restore sign: (x ^ s) - s */
    eor             v5.16b, v5.16b, v1.16b
    eor             v6.16b, v6.16b, v2.16b
    eor             v7.16b, v7.16b, v3.16b
    sub             v4.8h, v4.8h, v0.8h
    sub             v5.8h, v5.8h, v1.8h
    sub             v6.8h, v6.8h, v2.8h
    sub             v7.8h, v7.8h, v3.8h
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne            1b

    ret  /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
2788
DRCcf888482016-02-02 23:17:06 -06002789
DRCec6941f2016-01-15 09:29:11 -06002790/*****************************************************************************/
2791
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
 *                             JDIMENSION v_samp_factor,
 *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
 *                             JSAMPARRAY output_data);
 *
 * In:    x0 = IMAGE_WIDTH, x1 = MAX_V_SAMP (unused), x2 = V_SAMP (row count),
 *        x3 = BLOCK_WIDTH, x4 = INPUT_DATA, x5 = OUTPUT_DATA
 * Clobb: x0, x2-x5, x9-x13, w15, flags, v0, v2, v4, v6, v16, v18
 *
 * Each loop step consumes 16 input bytes and produces 8 output bytes.
 * Per row, BLOCK_WIDTH - 1 steps run unmodified; the final step first
 * applies a TBL permutation selected by (BLOCK_WIDTH * 16 - IMAGE_WIDTH).
 */

/* TBL index vectors, one 16-byte row per "diff" value 0..15.  Row N keeps
 * indices 0 .. 15-N and replaces every higher index with 15-N, i.e. it
 * replicates byte 15-N across the remaining N positions.
 */
.balign 16
Ljsimd_h2_downsample_neon_consts:
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
          0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
          0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
          0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
          0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
    .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
    .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */

asm_function jsimd_h2v1_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR           .req x10
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    /* image_width, v_samp_factor and width_blocks are JDIMENSION arguments.
     * NOTE(review): JDIMENSION is taken to be a 32-bit unsigned type; AAPCS64
     * leaves the upper 32 bits of such argument registers unspecified, so
     * zero-extend them before they are used in 64-bit arithmetic below.
     */
    uxtw            x0, w0
    uxtw            x2, w2
    uxtw            x3, w3

    mov             TMPDUP, #0x10000
    lsl             TMP2, BLOCK_WIDTH, #4
    sub             TMP2, TMP2, IMAGE_WIDTH  /* diff = BLOCK_WIDTH * 16 - IMAGE_WIDTH */
    adr             TMP3, Ljsimd_h2_downsample_neon_consts
    add             TMP3, TMP3, TMP2, lsl #4  /* 16 bytes per table row */
    dup             v16.4s, TMPDUP            /* halfword lanes = 0, 1, 0, 1, ... (bias) */
    ld1             {v18.16b}, [TMP3]         /* TBL indices for the last group */

1:  /* row loop */
    ldr             INPTR, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f                        /* only one group: go straight to the tail */
2:  /* columns */
    ld1             {v0.16b}, [INPTR], #16
    mov             v4.16b, v16.16b           /* start from the bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b             /* add each pair of adjacent bytes */
    shrn            v6.8b, v4.8h, #1          /* (a + b + bias) >> 1 */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR]
    mov             v4.16b, v16.16b
    /* Row counter: the flags set here are consumed by the b.ne below; none
     * of the vector instructions in between alter them.
     */
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    ret

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
2899
DRCcf888482016-02-02 23:17:06 -06002900
DRCec6941f2016-01-15 09:29:11 -06002901/*****************************************************************************/
2902
2903/*
2904 * Downsample pixel values of a single component.
2905 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2906 * without smoothing.
2907 *
2908 * GLOBAL(void)
2909 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2910 * JDIMENSION v_samp_factor, JDIMENSION width_blocks,
2911 * JSAMPARRAY input_data, JSAMPARRAY output_data);
2912 */
2913
.balign 16
asm_function jsimd_h2v2_downsample_neon
    /*
     * Each output byte is (a + b + c + d + bias) >> 2 over a 2x2 group of
     * input samples taken from two consecutive input rows; the bias
     * alternates 1, 2 across output columns.
     *
     * Clobbers x9-x14, w15, v0-v4, v6, v16, v18 and the flags.
     */
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR0          .req x10
    INPTR1          .req x14
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    /* image_width, v_samp_factor and width_blocks are JDIMENSION (32-bit)
       arguments, so the ABI does not guarantee that the upper 32 bits of
       x0/x2/x3 are valid.  Zero those bits before the registers are used in
       64-bit arithmetic. */
    uxtw            IMAGE_WIDTH, w0
    uxtw            V_SAMP, w2
    uxtw            BLOCK_WIDTH, w3

    mov             TMPDUP, #1
    lsl             TMP2, BLOCK_WIDTH, #4   /* 16 input samples per block */
    lsl             TMPDUP, TMPDUP, #17
    sub             TMP2, TMP2, IMAGE_WIDTH /* TMP2 = padding columns ("diff") */
    adr             TMP3, Ljsimd_h2_downsample_neon_consts
    orr             TMPDUP, TMPDUP, #1      /* 0x20001: halfword lanes = 1, 2 */
    add             TMP3, TMP3, TMP2, lsl #4  /* 16-byte table row for diff */
    dup             v16.4s, TMPDUP          /* v16 = rounding bias */
    ld1             {v18.16b}, [TMP3]       /* v18 = tbl indices (expand right) */

1:  /* row loop */
    ldr             INPTR0, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    ldr             INPTR1, [INPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1   /* all blocks but the last one */
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b         /* start from the bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b           /* += pairwise sums, upper row */
    uadalp          v4.8h, v1.16b           /* += pairwise sums, lower row */
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    /* No writeback: both input pointers are reloaded for the next row. */
    ld1             {v0.16b}, [INPTR0]
    ld1             {v1.16b}, [INPTR1]
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    tbl             v3.16b, {v1.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    uadalp          v4.8h, v3.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    ret

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR0
    .unreq          INPTR1
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
DRC219470d2016-02-07 20:36:02 -06002985
2986
2987/*****************************************************************************/
2988
/*
 * GLOBAL(JOCTET*)
 * jsimd_huff_encode_one_block_neon (working_state *state, JOCTET *buffer,
 *                                   JCOEFPTR block, int last_dc_val,
 *                                   c_derived_tbl *dctbl,
 *                                   c_derived_tbl *actbl)
 *
 * jsimd_huff_encode_one_block_neon_slowtbl takes the same arguments.
 */
2996
2997 BUFFER .req x1
2998 PUT_BUFFER .req x6
2999 PUT_BITS .req x7
3000 PUT_BITSw .req w7
3001
3002.macro emit_byte
3003 sub PUT_BITS, PUT_BITS, #0x8
3004 lsr x19, PUT_BUFFER, PUT_BITS
3005 uxtb w19, w19
3006 strb w19, [BUFFER, #1]!
3007 cmp w19, #0xff
3008 b.ne 14f
3009 strb wzr, [BUFFER, #1]!
301014:
3011.endm
3012.macro put_bits CODE, SIZE
3013 lsl PUT_BUFFER, PUT_BUFFER, \SIZE
3014 add PUT_BITS, PUT_BITS, \SIZE
3015 orr PUT_BUFFER, PUT_BUFFER, \CODE
3016.endm
3017.macro checkbuf31
3018 cmp PUT_BITS, #0x20
3019 b.lt 31f
3020 emit_byte
3021 emit_byte
3022 emit_byte
3023 emit_byte
302431:
3025.endm
3026.macro checkbuf47
3027 cmp PUT_BITS, #0x30
3028 b.lt 47f
3029 emit_byte
3030 emit_byte
3031 emit_byte
3032 emit_byte
3033 emit_byte
3034 emit_byte
303547:
3036.endm
3037
/*
 * Emits one Huffman block encoder plus its constant table.
 *
 *   fast_tbl == 1 : jsimd_huff_encode_one_block_neon; the coefficients are
 *                   reordered with tbl/tbx table lookups.
 *   otherwise     : jsimd_huff_encode_one_block_neon_slowtbl; the
 *                   coefficients are gathered with one lane load each.
 *
 * Register use inside the generated function:
 *   x0 = state, x1 = BUFFER, x2 = block, w3 = last_dc_val,
 *   x4 = dctbl, x5 = actbl, x6/x7 = PUT_BUFFER/PUT_BITS,
 *   x19/x20 = scratch (callee-saved, spilled to the frame).
 * Returns the updated output pointer in x0.
 *
 * Stack frame (272 bytes, sp stays at the bottom of it the whole time so
 * that nothing live is ever kept below the stack pointer):
 *   [sp, #0x00]          saved x19, x20
 *   [sp, #0x10 .. 0x8f]  t1: 64 halfwords (abs values, or bit counts on the
 *                        vector path)
 *   [sp, #0x90 .. 0x10f] t2: 64 halfwords (value bits to emit)
 */
.macro generate_jsimd_huff_encode_one_block fast_tbl

.balign 16
.if \fast_tbl == 1
Ljsimd_huff_encode_one_block_neon_consts:
.else
Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
.endif
    /* One bit per byte lane; used to pack compare masks into a bitmap. */
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
.if \fast_tbl == 1
    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
.endif

.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub             sp, sp, 272
    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
    /* Save ARM registers */
    /* Stored at the bottom of the frame without moving sp, so the saved
       pair never sits below the stack pointer. */
    stp             x19, x20, [sp]
.if \fast_tbl == 1
    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
.else
    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
.endif
    /* NOTE(review): offsets 0x10/0x18 are presumably the put_buffer and
       put_bits fields of working_state -- confirm against the C struct. */
    ldr             PUT_BUFFER, [x0, #0x10]
    ldr             PUT_BITSw, [x0, #0x18]
    ldrsh           w12, [x2]               /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1             {v23.16b}, [x15], #16
    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub             w12, w12, w3            /* last_dc_val, not used afterwards */
    /* ZigZag 8x8 */
    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins             v0.h[0], w12            /* lane 0 = DC difference */
    /* tbx fills the lanes whose source lay outside the tbl register window */
    tbx             v1.16b, {v28.16b}, v16.16b
    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx             v6.16b, {v31.16b}, v19.16b
.else
    /* Gather the coefficients into v0-v7 one halfword lane at a time; the
       address computations are interleaved with the loads. */
    add             x13, x2, #0x22
    sub             w12, w12, w3            /* last_dc_val, not used afterwards */
    ld1             {v23.16b}, [x15]
    add             x14, x2, #0x18
    add             x3, x2, #0x36
    ins             v0.h[0], w12            /* lane 0 = DC difference */
    add             x9, x2, #0x2
    ld1             {v1.h}[0], [x13]
    add             x15, x2, #0x30
    ld1             {v2.h}[0], [x14]
    add             x19, x2, #0x26
    ld1             {v3.h}[0], [x3]
    add             x20, x2, #0x28
    ld1             {v0.h}[1], [x9]
    add             x12, x2, #0x10
    ld1             {v1.h}[1], [x15]
    add             x13, x2, #0x40
    ld1             {v2.h}[1], [x19]
    add             x14, x2, #0x34
    ld1             {v3.h}[1], [x20]
    add             x3, x2, #0x1a
    ld1             {v0.h}[2], [x12]
    add             x9, x2, #0x20
    ld1             {v1.h}[2], [x13]
    add             x15, x2, #0x32
    ld1             {v2.h}[2], [x14]
    add             x19, x2, #0x42
    ld1             {v3.h}[2], [x3]
    add             x20, x2, #0xc
    ld1             {v0.h}[3], [x9]
    add             x12, x2, #0x12
    ld1             {v1.h}[3], [x15]
    add             x13, x2, #0x24
    ld1             {v2.h}[3], [x19]
    add             x14, x2, #0x50
    ld1             {v3.h}[3], [x20]
    add             x3, x2, #0xe
    ld1             {v0.h}[4], [x12]
    add             x9, x2, #0x4
    ld1             {v1.h}[4], [x13]
    add             x15, x2, #0x16
    ld1             {v2.h}[4], [x14]
    add             x19, x2, #0x60
    ld1             {v3.h}[4], [x3]
    add             x20, x2, #0x1c
    ld1             {v0.h}[5], [x9]
    add             x12, x2, #0x6
    ld1             {v1.h}[5], [x15]
    add             x13, x2, #0x8
    ld1             {v2.h}[5], [x19]
    add             x14, x2, #0x52
    ld1             {v3.h}[5], [x20]
    add             x3, x2, #0x2a
    ld1             {v0.h}[6], [x12]
    add             x9, x2, #0x14
    ld1             {v1.h}[6], [x13]
    add             x15, x2, #0xa
    ld1             {v2.h}[6], [x14]
    add             x19, x2, #0x44
    ld1             {v3.h}[6], [x3]
    add             x20, x2, #0x38
    ld1             {v0.h}[7], [x9]
    add             x12, x2, #0x46
    ld1             {v1.h}[7], [x15]
    add             x13, x2, #0x3a
    ld1             {v2.h}[7], [x19]
    add             x14, x2, #0x74
    ld1             {v3.h}[7], [x20]
    add             x3, x2, #0x6a
    ld1             {v4.h}[0], [x12]
    add             x9, x2, #0x54
    ld1             {v5.h}[0], [x13]
    add             x15, x2, #0x2c
    ld1             {v6.h}[0], [x14]
    add             x19, x2, #0x76
    ld1             {v7.h}[0], [x3]
    add             x20, x2, #0x78
    ld1             {v4.h}[1], [x9]
    add             x12, x2, #0x62
    ld1             {v5.h}[1], [x15]
    add             x13, x2, #0x1e
    ld1             {v6.h}[1], [x19]
    add             x14, x2, #0x68
    ld1             {v7.h}[1], [x20]
    add             x3, x2, #0x7a
    ld1             {v4.h}[2], [x12]
    add             x9, x2, #0x70
    ld1             {v5.h}[2], [x13]
    add             x15, x2, #0x2e
    ld1             {v6.h}[2], [x14]
    add             x19, x2, #0x5a
    ld1             {v7.h}[2], [x3]
    add             x20, x2, #0x6c
    ld1             {v4.h}[3], [x9]
    add             x12, x2, #0x72
    ld1             {v5.h}[3], [x15]
    add             x13, x2, #0x3c
    ld1             {v6.h}[3], [x19]
    add             x14, x2, #0x4c
    ld1             {v7.h}[3], [x20]
    add             x3, x2, #0x5e
    ld1             {v4.h}[4], [x12]
    add             x9, x2, #0x64
    ld1             {v5.h}[4], [x13]
    add             x15, x2, #0x4a
    ld1             {v6.h}[4], [x14]
    add             x19, x2, #0x3e
    ld1             {v7.h}[4], [x3]
    add             x20, x2, #0x6e
    ld1             {v4.h}[5], [x9]
    add             x12, x2, #0x56
    ld1             {v5.h}[5], [x15]
    add             x13, x2, #0x58
    ld1             {v6.h}[5], [x19]
    add             x14, x2, #0x4e
    ld1             {v7.h}[5], [x20]
    add             x3, x2, #0x7c
    ld1             {v4.h}[6], [x12]
    add             x9, x2, #0x48
    ld1             {v5.h}[6], [x13]
    add             x15, x2, #0x66
    ld1             {v6.h}[6], [x14]
    add             x19, x2, #0x5c
    ld1             {v7.h}[6], [x3]
    add             x20, x2, #0x7e
    ld1             {v4.h}[7], [x9]
    ld1             {v5.h}[7], [x15]
    ld1             {v6.h}[7], [x19]
    ld1             {v7.h}[7], [x20]
.endif
    /* v24-v31 = sign masks (all ones where the coefficient is negative) */
    cmlt            v24.8h, v0.8h, #0
    cmlt            v25.8h, v1.8h, #0
    cmlt            v26.8h, v2.8h, #0
    cmlt            v27.8h, v3.8h, #0
    cmlt            v28.8h, v4.8h, #0
    cmlt            v29.8h, v5.8h, #0
    cmlt            v30.8h, v6.8h, #0
    cmlt            v31.8h, v7.8h, #0
    /* v0-v7 = absolute values */
    abs             v0.8h, v0.8h
    abs             v1.8h, v1.8h
    abs             v2.8h, v2.8h
    abs             v3.8h, v3.8h
    abs             v4.8h, v4.8h
    abs             v5.8h, v5.8h
    abs             v6.8h, v6.8h
    abs             v7.8h, v7.8h
    /* v24-v31 = abs ^ sign mask: the value itself when non-negative, the
       one's complement of its magnitude when negative */
    eor             v24.16b, v24.16b, v0.16b
    eor             v25.16b, v25.16b, v1.16b
    eor             v26.16b, v26.16b, v2.16b
    eor             v27.16b, v27.16b, v3.16b
    eor             v28.16b, v28.16b, v4.16b
    eor             v29.16b, v29.16b, v5.16b
    eor             v30.16b, v30.16b, v6.16b
    eor             v31.16b, v31.16b, v7.16b
    /* Build a 64-bit bitmap with one bit per zero coefficient.  The DC
       symbol is computed and emitted in between the NEON instructions. */
    cmeq            v16.8h, v0.8h, #0
    cmeq            v17.8h, v1.8h, #0
    cmeq            v18.8h, v2.8h, #0
    cmeq            v19.8h, v3.8h, #0
    cmeq            v20.8h, v4.8h, #0
    cmeq            v21.8h, v5.8h, #0
    cmeq            v22.8h, v6.8h, #0
    xtn             v16.8b, v16.8h
    xtn             v18.8b, v18.8h
    xtn             v20.8b, v20.8h
    xtn             v22.8b, v22.8h
    umov            w14, v0.h[0]            /* w14 = |DC difference| */
    xtn2            v16.16b, v17.8h
    umov            w13, v24.h[0]           /* w13 = DC value bits */
    xtn2            v18.16b, v19.8h
    clz             w14, w14
    xtn2            v20.16b, v21.8h
    lsl             w13, w13, w14           /* shift up then down: keep */
    cmeq            v17.8h, v7.8h, #0
    sub             w12, w14, #32
    xtn2            v22.16b, v17.8h
    lsr             w13, w13, w14           /* only the low nbits bits */
    and             v16.16b, v16.16b, v23.16b
    neg             w12, w12                /* w12 = nbits = 32 - clz */
    and             v18.16b, v18.16b, v23.16b
    add             x3, x4, #0x400          /* r1 = dctbl->ehufsi */
    and             v20.16b, v20.16b, v23.16b
    add             x15, sp, #0x90          /* x15 = t2 */
    and             v22.16b, v22.16b, v23.16b
    ldr             w10, [x4, x12, lsl #2]  /* w10 = dctbl->ehufco[nbits] */
    addp            v16.16b, v16.16b, v18.16b
    ldrb            w11, [x3, x12]          /* w11 = dctbl->ehufsi[nbits] */
    addp            v20.16b, v20.16b, v22.16b
    checkbuf47
    addp            v16.16b, v16.16b, v20.16b
    put_bits        x10, x11
    addp            v16.16b, v16.16b, v18.16b
    checkbuf47
    umov            x9,v16.D[0]             /* x9 = zero-coefficient bitmap */
    put_bits        x13, x12
    cnt             v17.8b, v16.8b
    mvn             x9, x9                  /* x9 = non-zero bitmap */
    addv            B18, v17.8b
    add             x4, x5, #0x400          /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]           /* w12 = number of zero coeffs */
    lsr             x9, x9, #0x1            /* drop bit 0 (coefficient 0) */
    ldr             w13, [x5, #0x3c0]       /* x13 = actbl->ehufco[0xf0] */
    rbit            x9, x9                  /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]        /* x14 = actbl->ehufsi[0xf0] */
    cmp             w12, #(64-8)
    add             x11, sp, #0x10          /* x11 = t1 */
    b.lt            4f                      /* < 56 zeros: vector bit counts */
    cbz             x9, 6f                  /* no non-zero AC coefficient */
    /* Few non-zero coefficients: spill abs values (t1) and value bits (t2)
       and compute the bit count of each coefficient with scalar code. */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                  /* x2 = run of zero coefficients */
    add             x15, x15, x2, lsl #1    /* skip the run in t2 */
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]       /* matching t1 entry (abs value) */
2:
    cmp             x2, #0x10
    b.lt            3f
    /* run of 16 or more: emit symbol 0xf0 and retry with run - 16 */
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    clz             w20, w20
    ldrh            w3, [x15, #2]!          /* value bits from t2 */
    sub             w11, w20, #32
    lsl             w3, w3, w20             /* shift up then down: keep */
    neg             w11, w11                /* w11 = nbits = 32 - clz */
    lsr             w3, w3, w20             /* only the low nbits bits */
    add             x2, x11, x2, lsl #4     /* symbol = (run << 4) + nbits */
    lsl             x9, x9, #0x1            /* consume this coefficient */
    ldr             w12, [x5, x2, lsl #2]   /* actbl->ehufco[symbol] */
    ldrb            w10, [x4, x2]           /* actbl->ehufsi[symbol] */
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
    b               6f
4:
    /* Many non-zero coefficients: compute every bit count with NEON.
       v0-v7 become 16 - clz(abs), and v24-v31 are shifted left then right
       by clz so that only the low nbits bits remain. */
    movi            v21.8h, #0x0010
    clz             v0.8h, v0.8h
    clz             v1.8h, v1.8h
    clz             v2.8h, v2.8h
    clz             v3.8h, v3.8h
    clz             v4.8h, v4.8h
    clz             v5.8h, v5.8h
    clz             v6.8h, v6.8h
    clz             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    neg             v0.8h, v0.8h
    neg             v1.8h, v1.8h
    neg             v2.8h, v2.8h
    neg             v3.8h, v3.8h
    neg             v4.8h, v4.8h
    neg             v5.8h, v5.8h
    neg             v6.8h, v6.8h
    neg             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h   /* negative count = right shift */
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    add             v0.8h, v21.8h, v0.8h    /* nbits = 16 - clz */
    add             v1.8h, v21.8h, v1.8h
    add             v2.8h, v21.8h, v2.8h
    add             v3.8h, v21.8h, v3.8h
    add             v4.8h, v21.8h, v4.8h
    add             v5.8h, v21.8h, v5.8h
    add             v6.8h, v21.8h, v6.8h
    add             v7.8h, v21.8h, v7.8h
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                  /* x2 = run of zero coefficients */
    add             x15, x15, x2, lsl #1    /* skip the run in t2 */
    lsl             x9, x9, x2
    ldrh            w11, [x15, #-126]       /* matching t1 entry (nbits) */
2:
    cmp             x2, #0x10
    b.lt            3f
    /* run of 16 or more: emit symbol 0xf0 and retry with run - 16 */
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    ldrh            w3, [x15, #2]!          /* value bits from t2 */
    add             x2, x11, x2, lsl #4     /* symbol = (run << 4) + nbits */
    lsl             x9, x9, #0x1            /* consume this coefficient */
    ldr             w12, [x5, x2, lsl #2]   /* actbl->ehufco[symbol] */
    ldrb            w10, [x4, x2]           /* actbl->ehufsi[symbol] */
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
6:
    /* If the t2 cursor did not reach the last entry (t2 + 2 * 63), the
       block ends in zeros: emit symbol 0. */
    add             x13, sp, #0x10e
    cmp             x15, x13
    b.hs            1f
    ldr             w12, [x5]
    ldrb            w14, [x4]
    checkbuf47
    put_bits        x12, x14
1:
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16
    add             x0, BUFFER, #0x1        /* return next free output byte */
    add             sp, sp, 256
    ret

.endm
3441
/* Instantiate both encoder variants: fast_tbl == 1 emits
   jsimd_huff_encode_one_block_neon, any other value emits
   jsimd_huff_encode_one_block_neon_slowtbl. */
generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

/* The helper macros and register aliases are private to the encoders. */
.purgem checkbuf47
.purgem checkbuf31
.purgem put_bits
.purgem emit_byte

    .unreq          PUT_BITSw
    .unreq          PUT_BITS
    .unreq          PUT_BUFFER
    .unreq          BUFFER