/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16 \x0, \x1
    vtrn.16 \x2, \x3
    vtrn.32 \x0, \x2
    vtrn.32 \x1, \x3
.endm
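
/*
 * Scalar model of the macro above (a sketch): the vtrn.16 pair exchanges
 * 16-bit lanes between adjacent rows and the vtrn.32 pair then exchanges
 * 32-bit halves, so the combined effect on a 4x4 tile 'm' is a full
 * transpose:
 *
 *   for (i = 0; i < 4; i++)
 *     for (j = 0; j < i; j++)
 *       swap(m[i][j], m[j][i]);
 */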


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
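
/*
 * These are the jidctint.c fixed-point constants, i.e. FIX(x) values with
 * 13 fractional bits (a reference sketch; the code below uses the
 * precomputed numbers directly):
 *
 *   #define CONST_BITS 13
 *   #define FIX(x) ((INT32) ((x) * (1 << CONST_BITS) + 0.5))
 *
 * so that, e.g., FIX(0.298631336) == 2446 and FIX(1.175875602) == 9633.
 */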

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
\
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
\
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
\
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
\
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
\
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
\
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
\
    /* pick up the results */ \
    tmp0 = q4; \
    tmp1 = q5; \
    tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3 = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}
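
/*
 * A complete 8x8 inverse transform applies this 1-D transform twice: first
 * down the columns with an 11-bit descale (CONST_BITS - PASS1_BITS, the
 * 'vrshrn #11' in pass 1 below), then across the rows with an 18-bit
 * descale plus the CENTERJSAMPLE bias ('vshrn #16' followed by 'vqrshrn #2'
 * in pass 2).  A rough usage sketch in the style of jidctint.c:
 *
 *   for (ctr = 0; ctr < 8; ctr++) {
 *     REF_1D_IDCT(in[8 * 0 + ctr], in[8 * 1 + ctr], ..., in[8 * 7 + ctr]);
 *     ws[8 * 0 + ctr] = DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
 *     ws[8 * 7 + ctr] = DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
 *     ... and likewise for the tmp11/tmp2, tmp12/tmp1, tmp13/tmp0 pairs
 *   }
 */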

#define XFIX_0_899976223 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_2_562915447 d0[2]
#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
#define XFIX_1_175875602 d1[3]
#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
#define XFIX_1_175875602_MINUS_1_961570560 d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    ROW0L .req d16
    ROW0R .req d17
    ROW1L .req d18
    ROW1R .req d19
    ROW2L .req d20
    ROW2R .req d21
    ROW3L .req d22
    ROW3R .req d23
    ROW4L .req d24
    ROW4R .req d25
    ROW5L .req d26
    ROW5R .req d27
    ROW6L .req d28
    ROW6R .req d29
    ROW7L .req d30
    ROW7R .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr ip, jsimd_idct_islow_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add ip, ip, #16
    vmul.s16 q15, q15, q3
    vpush {d8-d15}  /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16 d4, ROW7L, ROW3L
    vadd.s16 d5, ROW5L, ROW1L
    vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d5, XFIX_1_175875602
    vmull.s16 q7, d4, XFIX_1_175875602
    /* Check whether the right 4x8 half contains only zero coefficients */
    push {r4, r5}
    vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW4L
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr r0, r4, r5
    vmov q4, q6
    vmlsl.s16 q6, ROW5L, XFIX_2_562915447
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    orr r0, r0, r4
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    orr r0, r0, r5
    vadd.s32 q1, q3, q2
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov q5, q7
    vadd.s32 q1, q1, q6
    orr r0, r0, r4
    vmlsl.s16 q7, ROW7L, XFIX_0_899976223
    orr r0, r0, r5
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1L, q1, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr r0, r0, r4
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    orr r0, r0, r5
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16 q6, ROW6L, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    orr r0, r0, r4
    vrshrn.s32 ROW6L, q1, #11
    orr r0, r0, r5
    vadd.s32 q1, q3, q5
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW4L
    orr r0, r0, r4
    vrshrn.s32 ROW2L, q1, #11
    orr r0, r0, r5
    vrshrn.s32 ROW5L, q3, #11
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr r0, r0, r4
    vadd.s32 q2, q5, q6
    orrs r0, r0, r5
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    orr r0, r4, r5
    vsub.s32 q3, q1, q4
    pop {r4, r5}
    vrshrn.s32 ROW7L, q2, #11
    vrshrn.s32 ROW3L, q5, #11
    vrshrn.s32 ROW0L, q6, #11
    vrshrn.s32 ROW4L, q3, #11

    beq 3f  /* Branch to the special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vadd.s16 d10, ROW7R, ROW3R
    vadd.s16 d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, d8, XFIX_1_175875602
    vtrn.16 ROW2L, ROW3L
    vmull.s16 q7, d10, XFIX_1_175875602
    vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16 ROW0L, ROW1L
    vsubl.s16 q3, ROW0R, ROW4R
    vmull.s16 q2, ROW2R, XFIX_0_541196100
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16 ROW4L, ROW5L
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32 ROW1L, ROW3L
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1R, XFIX_0_899976223
    vtrn.32 ROW4L, ROW6L
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vtrn.32 ROW0L, ROW2L
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32 ROW1R, q1, #11
    vtrn.32 ROW5L, ROW7L
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW3R, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vrshrn.s32 ROW6R, q1, #11
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0R, ROW4R
    vrshrn.s32 ROW2R, q1, #11
    vrshrn.s32 ROW5R, q3, #11
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vrshrn.s32 ROW7R, q2, #11
    vrshrn.s32 ROW3R, q5, #11
    vrshrn.s32 ROW0R, q6, #11
    vrshrn.s32 ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16 ROW6R, ROW7R
    vtrn.16 ROW2R, ROW3R
    vtrn.16 ROW0R, ROW1R
    vtrn.16 ROW4R, ROW5R
    vtrn.32 ROW1R, ROW3R
    vtrn.32 ROW4R, ROW6R
    vtrn.32 ROW0R, ROW2R
    vtrn.32 ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16 q3, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov q4, q6
    vmlsl.s16 q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16 q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW5R, XFIX_1_175875602
    vmlal.s16 q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16 q7, ROW7R, XFIX_1_175875602
    vmlal.s16 q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16 q3, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vmull.s16 q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov q4, q6
    vmlsl.s16 q6, ROW5R, XFIX_2_562915447
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32 q3, q3, #13
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vadd.s32 q1, q1, q6
    vmlsl.s16 q7, ROW7R, XFIX_0_899976223
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32 ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16 q6, ROW6R, XFIX_0_541196100
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vaddl.s16 q5, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vshl.s32 q5, q5, #13
    vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16 d16, q8, #2
    vqrshrn.s16 d17, q9, #2
    vqrshrn.s16 d18, q10, #2
    vqrshrn.s16 d19, q11, #2
    vpop {d8-d15}  /* restore NEON registers */
    vqrshrn.s16 d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16 q8, q9
    vqrshrn.s16 d21, q13, #2
    vqrshrn.s16 d22, q14, #2
    vmov.u8 q0, #(CENTERJSAMPLE)
    vqrshrn.s16 d23, q15, #2
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vtrn.16 q10, q11
    /* Store results to the output buffer */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vadd.u8 q10, q10, q0
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vtrn.8 d22, d23
    vst1.8 {d20}, [TMP1]
    vadd.u8 q11, q11, q0
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

3:  /* The left 4x8 half is done; the right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16 ROW6L, ROW7L
    vtrn.16 ROW2L, ROW3L
    vtrn.16 ROW0L, ROW1L
    vtrn.16 ROW4L, ROW5L
    vshl.s16 ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32 ROW1L, ROW3L
    vtrn.32 ROW4L, ROW6L
    vtrn.32 ROW0L, ROW2L
    vtrn.32 ROW5L, ROW7L

    cmp r0, #0
    beq 4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero in the right 4x8 half */
    vdup.s16 ROW1R, ROW0R[1]
    vdup.s16 ROW2R, ROW0R[2]
    vdup.s16 ROW3R, ROW0R[3]
    vdup.s16 ROW4R, ROW0R[0]
    vdup.s16 ROW5R, ROW0R[1]
    vdup.s16 ROW6R, ROW0R[2]
    vdup.s16 ROW7R, ROW0R[3]
    vdup.s16 ROW0R, ROW0R[0]
    b 1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW1L, XFIX_1_175875602
    vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW3L, XFIX_1_175875602
    vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW2L, XFIX_0_541196100
    vshll.s16 q3, ROW0L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW1L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW3L, XFIX_2_562915447
    vshrn.s32 ROW1L, q1, #16
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW0L, #13
    vshrn.s32 ROW2L, q1, #16
    vshrn.s32 ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW3L, q5, #16
    vshrn.s32 ROW0L, q6, #16
    vshrn.s32 ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16 {d2}, [ip, :64]  /* reload constants */
    vmull.s16 q6, ROW5L, XFIX_1_175875602
    vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16 q7, ROW7L, XFIX_1_175875602
    vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16 q2, ROW6L, XFIX_0_541196100
    vshll.s16 q3, ROW4L, #13
    vmov q4, q6
    vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16 q4, ROW5L, XFIX_0_899976223
    vadd.s32 q1, q3, q2
    vmov q5, q7
    vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32 q1, q1, q6
    vadd.s32 q6, q6, q6
    vmlsl.s16 q5, ROW7L, XFIX_2_562915447
    vshrn.s32 ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32 q1, q1, q6
    vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32 q3, q3, q2
    vshrn.s32 ROW6R, q1, #16
    vadd.s32 q1, q3, q5
    vsub.s32 q3, q3, q5
    vshll.s16 q5, ROW4L, #13
    vshrn.s32 ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32 ROW5R, q3, #16
    vadd.s32 q2, q5, q6
    vsub.s32 q1, q5, q6
    vadd.s32 q6, q2, q7
    vsub.s32 q2, q2, q7
    vadd.s32 q5, q1, q4
    vsub.s32 q3, q1, q4
    vshrn.s32 ROW7R, q2, #16
    vshrn.s32 ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32 ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32 ROW4R, q3, #16
    b 2b  /* Go to epilogue */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

    .unreq ROW0L
    .unreq ROW0R
    .unreq ROW1L
    .unreq ROW1R
    .unreq ROW2L
    .unreq ROW2R
    .unreq ROW3L
    .unreq ROW3R
    .unreq ROW4L
    .unreq ROW4R
    .unreq ROW5L
    .unreq ROW5R
    .unreq ROW6L
    .unreq ROW6R
    .unreq ROW7L
    .unreq ROW7R
.endfunc


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally a 1-D AAN inverse DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
 */
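
/*
 * The constant table below therefore stores only the fractional parts of
 * the multipliers, pre-scaled to Q15 for VQDMULH, which computes
 * (a * b * 2) >> 16 with saturation.  A scalar model of one such multiply
 * (a sketch, not code used below):
 *
 *   int16_t frac = 277 * 128 - 256 * 128;    fraction of 277/256 in Q15
 *   int16_t high = (int16_t) (((int32_t) x * frac * 2) >> 16);    VQDMULH
 *   int16_t res  = x + high;       approximately x * 1.082392200  VADD
 */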

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr ip, jsimd_idct_ifast_neon_consts
    vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16 q8, q8, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q9, q9, q1
    vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16 q10, q10, q2
    vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16 q11, q11, q3
    vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16 q12, q12, q0
    vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16 q14, q14, q2
    vmul.s16 q13, q13, q1
    vld1.16 {d0}, [ip, :64]  /* load constants */
    vmul.s16 q15, q15, q3
    vpush {d8-d13}  /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16 q2, q10, q14
    vadd.s16 q14, q10, q14
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vadd.s16 q10, q10, q2
    /* Transpose */
    vtrn.16 q8, q9
    vsub.s16 q11, q12, q1
    vtrn.16 q14, q15
    vadd.s16 q12, q12, q1
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q8, q10
    vtrn.32 q13, q15
    vswp d28, d21
    vswp d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16 q2, q10, q14
    vswp d30, d23
    vadd.s16 q14, q10, q14
    vswp d24, d17
    vsub.s16 q1, q11, q13
    vadd.s16 q13, q11, q13
    vsub.s16 q5, q9, q15
    vadd.s16 q15, q9, q15
    vqdmulh.s16 q4, q2, XFIX_1_414213562
    vqdmulh.s16 q6, q1, XFIX_2_613125930
    vadd.s16 q3, q1, q1
    vsub.s16 q1, q5, q1
    vadd.s16 q10, q2, q4
    vqdmulh.s16 q4, q1, XFIX_1_847759065
    vsub.s16 q2, q15, q13
    vadd.s16 q3, q3, q6
    vqdmulh.s16 q6, q2, XFIX_1_414213562
    vadd.s16 q1, q1, q4
    vqdmulh.s16 q4, q5, XFIX_1_082392200
    vsub.s16 q10, q10, q14
    vadd.s16 q2, q2, q6
    vsub.s16 q6, q8, q12
    vadd.s16 q12, q8, q12
    vadd.s16 q9, q5, q4
    vadd.s16 q5, q6, q10
    vsub.s16 q10, q6, q10
    vadd.s16 q6, q15, q13
    vadd.s16 q8, q12, q14
    vsub.s16 q3, q6, q3
    vsub.s16 q12, q12, q14
    vsub.s16 q3, q3, q1
    vsub.s16 q1, q9, q1
    vadd.s16 q2, q3, q2
    vsub.s16 q15, q8, q6
    vadd.s16 q1, q1, q2
    vadd.s16 q8, q8, q6
    vadd.s16 q14, q5, q3
    vsub.s16 q9, q5, q3
    vsub.s16 q13, q10, q2
    vpop {d8-d13}  /* restore NEON registers */
    vadd.s16 q10, q10, q2
    vsub.s16 q11, q12, q1
    vadd.s16 q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8 q0, #0x80
    vqshrn.s16 d16, q8, #5
    vqshrn.s16 d17, q9, #5
    vqshrn.s16 d18, q10, #5
    vqshrn.s16 d19, q11, #5
    vqshrn.s16 d20, q12, #5
    vqshrn.s16 d21, q13, #5
    vqshrn.s16 d22, q14, #5
    vqshrn.s16 d23, q15, #5
    vadd.u8 q8, q8, q0
    vadd.u8 q9, q9, q0
    vadd.u8 q10, q10, q0
    vadd.u8 q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.8 d16, d17
    vtrn.8 d18, d19
    /* Store results to the output buffer */
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d16}, [TMP1]
    vst1.8 {d17}, [TMP2]
    ldmia OUTPUT_BUF!, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    vst1.8 {d18}, [TMP1]
    vtrn.8 d20, d21
    vst1.8 {d19}, [TMP2]
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    vst1.8 {d20}, [TMP1]
    vtrn.8 d22, d23
    vst1.8 {d21}, [TMP2]
    vst1.8 {d22}, [TMP3]
    vst1.8 {d23}, [TMP4]
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */
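
/*
 * For reference, the underlying jpeg-6b 1-D maths (jidctred.c; a sketch of
 * what the idct_helper macro below evaluates per column; row 4 of the
 * input is never used):
 *
 *   tmp0  = ((INT32) in0) << (CONST_BITS + 1);
 *   tmp2  = MULTIPLY(in2, FIX_1_847759065) - MULTIPLY(in6, FIX_0_765366865);
 *   tmp10 = tmp0 + tmp2;
 *   tmp12 = tmp0 - tmp2;
 *   tmp0  = MULTIPLY(in1, FIX_1_061594337) - MULTIPLY(in3, FIX_2_172734803) +
 *           MULTIPLY(in5, FIX_1_451774981) - MULTIPLY(in7, FIX_0_211164243);
 *   tmp2  = MULTIPLY(in1, FIX_2_562915447) + MULTIPLY(in3, FIX_0_899976223) -
 *           MULTIPLY(in5, FIX_0_601344887) - MULTIPLY(in7, FIX_0_509795579);
 *   out0  = DESCALE(tmp10 + tmp2, shift);
 *   out3  = DESCALE(tmp10 - tmp2, shift);
 *   out1  = DESCALE(tmp12 + tmp0, shift);
 *   out2  = DESCALE(tmp12 - tmp0, shift);
 */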

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16 q14, \x4, d2[2]
    vmlal.s16 q14, \x8, d0[0]
    vmlal.s16 q14, \x14, d0[1]

    vmull.s16 q13, \x16, d1[2]
    vmlal.s16 q13, \x12, d1[3]
    vmlal.s16 q13, \x10, d2[0]
    vmlal.s16 q13, \x6, d2[1]

    vmull.s16 q15, \x4, d2[2]
    vmlsl.s16 q15, \x8, d0[0]
    vmlsl.s16 q15, \x14, d0[1]

    vmull.s16 q12, \x16, d0[2]
    vmlal.s16 q12, \x12, d0[3]
    vmlal.s16 q12, \x10, d1[0]
    vmlal.s16 q12, \x6, d1[1]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

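/* Note: vrshrn.s32 narrowing shifts only accept immediates of 1..16, so the
 * larger pass 2 descale (19 here, 20 in the 2x2 version below) is split into
 * a wide rounding shift plus a separate narrowing move (explanatory note). */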
.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y29, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y29, q14, #\shift
.endif

    vadd.s32 q10, q15, q12
    vsub.s32 q15, q15, q12

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q15, q15, #\shift
    vmovn.s32 \y27, q10
    vmovn.s32 \y28, q15
.else
    vrshrn.s32 \y27, q10, #\shift
    vrshrn.s32 \y28, q15, #\shift
.endif

.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req r1
    TMP3 .req r2
    TMP4 .req ip

    vpush {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr TMP4, jsimd_idct_4x4_neon_consts
    vld1.16 {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q3, q3, q10
    vmul.s16 q4, q4, q11
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    vmul.s16 q6, q6, q13
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q7, q7, q14
    vmul.s16 q8, q8, q15

    /* Pass 1 */
    idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4 d5, d7, d9, d11

    /* Pass 2 */
    idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4 d26, d27, d28, d29

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vadd.s16 q14, q14, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q14

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32 {d26[0]}, [TMP1]!
    vst1.32 {d27[0]}, [TMP3]!
    vst1.32 {d26[1]}, [TMP2]!
    vst1.32 {d27[1]}, [TMP4]!
#else
    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[0]}, [TMP3]!
    vst1.8 {d26[1]}, [TMP1]!
    vst1.8 {d27[1]}, [TMP3]!
    vst1.8 {d26[2]}, [TMP1]!
    vst1.8 {d27[2]}, [TMP3]!
    vst1.8 {d26[3]}, [TMP1]!
    vst1.8 {d27[3]}, [TMP3]!

    vst1.8 {d26[4]}, [TMP2]!
    vst1.8 {d27[4]}, [TMP4]!
    vst1.8 {d26[5]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP4]!
    vst1.8 {d26[6]}, [TMP2]!
    vst1.8 {d27[6]}, [TMP4]!
    vst1.8 {d26[7]}, [TMP2]!
    vst1.8 {d27[7]}, [TMP4]!
#endif

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
.endfunc

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 */
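
/*
 * For reference, the underlying jpeg-6b 1-D maths (jidctred.c; a sketch of
 * what the idct_helper macro below evaluates; the 'vshll #15' equals
 * CONST_BITS + 2):
 *
 *   tmp10 = ((INT32) in0) << (CONST_BITS + 2);
 *   tmp0  = MULTIPLY(in1, FIX_3_624509785) - MULTIPLY(in3, FIX_1_272758580) +
 *           MULTIPLY(in5, FIX_0_850430095) - MULTIPLY(in7, FIX_0_720959822);
 *   out0  = DESCALE(tmp10 + tmp0, shift);
 *   out1  = DESCALE(tmp10 - tmp0, shift);
 */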

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16 q14, \x4, #15
    vmull.s16 q13, \x6, d0[3]
    vmlal.s16 q13, \x10, d0[2]
    vmlal.s16 q13, \x12, d0[1]
    vmlal.s16 q13, \x16, d0[0]

    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13

.if \shift > 16
    vrshr.s32 q10, q10, #\shift
    vrshr.s32 q14, q14, #\shift
    vmovn.s32 \y26, q10
    vmovn.s32 \y27, q14
.else
    vrshrn.s32 \y26, q10, #\shift
    vrshrn.s32 \y27, q14, #\shift
.endif

.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE .req r0
    COEF_BLOCK .req r1
    OUTPUT_BUF .req r2
    OUTPUT_COL .req r3
    TMP1 .req r0
    TMP2 .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP2, jsimd_idct_2x2_neon_consts
    vld1.16 {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | -      | -
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | -      | -
     *   7  | d16    | d17
     */
    vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
    add COEF_BLOCK, COEF_BLOCK, #16
    vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16 q2, q2, q9
    vmul.s16 q3, q3, q10
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16 q5, q5, q12
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16 q6, q6, q13
    add DCT_TABLE, DCT_TABLE, #16
    vld1.16 {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16 q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4 d4, d6, d8, d10
    idct_helper d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4 d5, d7, d9, d11
#else
    vmull.s16 q13, d6, d0[3]
    vmlal.s16 q13, d10, d0[2]
    vmlal.s16 q13, d12, d0[1]
    vmlal.s16 q13, d16, d0[0]
    vmull.s16 q12, d7, d0[3]
    vmlal.s16 q12, d11, d0[2]
    vmlal.s16 q12, d13, d0[1]
    vmlal.s16 q12, d17, d0[0]
    vshll.s16 q14, d4, #15
    vshll.s16 q15, d5, #15
    vadd.s32 q10, q14, q13
    vsub.s32 q14, q14, q13
    vrshrn.s32 d4, q10, #13
    vrshrn.s32 d6, q14, #13
    vadd.s32 q10, q15, q12
    vsub.s32 q14, q15, q12
    vrshrn.s32 d5, q10, #13
    vrshrn.s32 d7, q14, #13
    vtrn.16 q2, q3
    vtrn.32 q3, q5
#endif

    /* Pass 2 */
    idct_helper d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16 q15, #0x80
    vadd.s16 q13, q13, q15
    vqmovun.s16 d26, q13
    vqmovun.s16 d27, q13

    /* Store results to the output buffer */
    ldmia OUTPUT_BUF, {TMP1, TMP2}
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    vst1.8 {d26[0]}, [TMP1]!
    vst1.8 {d27[4]}, [TMP1]!
    vst1.8 {d26[1]}, [TMP2]!
    vst1.8 {d27[5]}, [TMP2]!

    vpop {d8-d15}
    bx lr

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
.endfunc

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
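
/*
 * All six variants use the standard JPEG YCbCr->RGB equations in fixed
 * point, with the constants 22971 (1.40200 in Q14), -11277 (-0.34414 in
 * Q15), -23401 (-0.71414 in Q15) and 29033 (1.77200 in Q14) that appear in
 * the multiply comments below.  A scalar reference sketch of the per-pixel
 * maths (assumed equivalent; not used by the code):
 *
 *   int u = cb - 128, v = cr - 128;
 *   r = clamp8(y + ((22971 * v + 8192) >> 14));
 *   g = clamp8(y + ((-11277 * u - 23401 * v + 16384) >> 15));
 *   b = clamp8(y + ((29033 * u + 8192) >> 14));
 *
 * where the rounding additions model vrshrn and clamp8() models the
 * saturation performed by vqmovun.
 */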

.macro do_load size
    .if \size == 8
        vld1.8 {d4}, [U, :64]!
        vld1.8 {d5}, [V, :64]!
        vld1.8 {d0}, [Y, :64]!
        pld [U, #64]
        pld [V, #64]
        pld [Y, #64]
    .elseif \size == 4
        vld1.8 {d4[0]}, [U]!
        vld1.8 {d4[1]}, [U]!
        vld1.8 {d4[2]}, [U]!
        vld1.8 {d4[3]}, [U]!
        vld1.8 {d5[0]}, [V]!
        vld1.8 {d5[1]}, [V]!
        vld1.8 {d5[2]}, [V]!
        vld1.8 {d5[3]}, [V]!
        vld1.8 {d0[0]}, [Y]!
        vld1.8 {d0[1]}, [Y]!
        vld1.8 {d0[2]}, [Y]!
        vld1.8 {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8 {d4[4]}, [U]!
        vld1.8 {d4[5]}, [U]!
        vld1.8 {d5[4]}, [V]!
        vld1.8 {d5[5]}, [V]!
        vld1.8 {d0[4]}, [Y]!
        vld1.8 {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8 {d4[6]}, [U]!
        vld1.8 {d5[6]}, [V]!
        vld1.8 {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8 {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8 {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */
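
/*
 * Stage 1 performs the widening multiplies for one 8-pixel group; stage 2
 * narrows the results, adds Y and saturates.  The fused
 * do_yuv_to_rgb_stage2_store_load_stage1 macro interleaves one group's
 * stage 2 and store with the next group's load and stage 1, hiding
 * load-use latency inside the main loop (explanatory note).
 */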

.macro do_yuv_to_rgb_stage1
    vaddw.u8 q3, q1, d4  /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5  /* q4 = v - 128 */
    vmull.s16 q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q10, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vqmovun.s16 d1\r_offs, q12
    vqmovun.s16 d1\b_offs, q14
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8 {d4}, [U, :64]!
    vrshrn.s32 d20, q10, #15
    vrshrn.s32 d21, q11, #15
    vrshrn.s32 d24, q12, #14
    vrshrn.s32 d25, q13, #14
    vrshrn.s32 d28, q14, #14
    vld1.8 {d5}, [V, :64]!
    vrshrn.s32 d29, q15, #14
    vaddw.u8 q10, q10, d0
    vaddw.u8 q12, q12, d0
    vaddw.u8 q14, q14, d0
    vqmovun.s16 d1\g_offs, q10
    vld1.8 {d0}, [Y, :64]!
    vqmovun.s16 d1\r_offs, q12
    pld [U, #64]
    pld [V, #64]
    pld [Y, #64]
    vqmovun.s16 d1\b_offs, q14
    vaddw.u8 q3, q1, d4  /* q3 = u - 128 */
    vaddw.u8 q4, q1, d5  /* q4 = v - 128 */
    do_store \bpp, 8
    vmull.s16 q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16 q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16 q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16 q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16 q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16 q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16 q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16 q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH .req r0
    INPUT_BUF .req r1
    INPUT_ROW .req r2
    OUTPUT_BUF .req r3
    NUM_ROWS .req r4

    INPUT_BUF0 .req r5
    INPUT_BUF1 .req r6
    INPUT_BUF2 .req INPUT_BUF

    RGB .req r7
    Y .req r8
    U .req r9
    V .req r10
    N .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #4]
    ldr INPUT_BUF2, [INPUT_BUF, #8]
    .unreq INPUT_BUF

    /* Save NEON registers */
    vpush {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8 q5, #255
    vmov.u8 q6, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs N, N, #8
    blt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    blt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8
    tst N, #7
    beq 8f
3:
    tst N, #4
    beq 3f
    do_load 4
3:
    tst N, #2
    beq 4f
    do_load 2
4:
    tst N, #1
    beq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    beq 6f
    do_store \bpp, 4
6:
    tst N, #2
    beq 7f
    do_store \bpp, 2
7:
    tst N, #1
    beq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N
.endfunc

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R G B */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
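
/*
 * These are the standard JPEG RGB->YCbCr equations:
 *
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
 *
 * evaluated in Q16 fixed point (the #16 shifts in the stage 2 macros
 * below).  A scalar sketch using the jccolor.c constants, which these
 * functions are assumed to match:
 *
 *   y  = ( 19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
 *   cb = ((-11059 * r - 21709 * g + 32768 * b) >> 16) + 128;
 *   cr = (( 32768 * r - 27439 * g -  5329 * b) >> 16) + 128;
 */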
1583
1584.macro do_store size
1585 .if \size == 8
1586 vst1.8 {d20}, [Y]!
1587 vst1.8 {d21}, [U]!
1588 vst1.8 {d22}, [V]!
1589 .elseif \size == 4
1590 vst1.8 {d20[0]}, [Y]!
1591 vst1.8 {d20[1]}, [Y]!
1592 vst1.8 {d20[2]}, [Y]!
1593 vst1.8 {d20[3]}, [Y]!
1594 vst1.8 {d21[0]}, [U]!
1595 vst1.8 {d21[1]}, [U]!
1596 vst1.8 {d21[2]}, [U]!
1597 vst1.8 {d21[3]}, [U]!
1598 vst1.8 {d22[0]}, [V]!
1599 vst1.8 {d22[1]}, [V]!
1600 vst1.8 {d22[2]}, [V]!
1601 vst1.8 {d22[3]}, [V]!
1602 .elseif \size == 2
1603 vst1.8 {d20[4]}, [Y]!
1604 vst1.8 {d20[5]}, [Y]!
1605 vst1.8 {d21[4]}, [U]!
1606 vst1.8 {d21[5]}, [U]!
1607 vst1.8 {d22[4]}, [V]!
1608 vst1.8 {d22[5]}, [V]!
1609 .elseif \size == 1
1610 vst1.8 {d20[6]}, [Y]!
1611 vst1.8 {d21[6]}, [U]!
1612 vst1.8 {d22[6]}, [V]!
1613 .else
1614 .error unsupported macroblock size
1615 .endif
1616.endm

.macro do_load bpp, size
    .if \bpp == 24
        .if \size == 8
            vld3.8  {d10, d11, d12}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vld4.8  {d10, d11, d12, d13}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm
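
/*
 * Note (added for clarity): the 4-, 2-, and 1-pixel variants in the two
 * macros above transfer one lane at a time into fixed lane positions
 * (lanes 0-3, 4-5, and 6 respectively), so a tail of 1-7 pixels can be
 * assembled from the #4, #2, and #1 cases without touching memory past
 * the end of the row.
 */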

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * Two-stage pipelined RGB->YCbCr conversion
 */
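
/*
 * Note (added for clarity): stage1 widens the loaded RGB bytes and issues
 * the long multiply-accumulates into 32-bit accumulators; stage2 narrows
 * the accumulators back to bytes.  The fused stage2_store_load_stage1
 * macro interleaves the stores of one 8-pixel group with the loads and
 * multiplies of the next, hiding the multiplier latency in the main loop.
 */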

.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10      /* d20 = y */
    vmovn.u16       d21, q11      /* d21 = u */
    vmovn.u16       d22, q12      /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
    vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10      /* d20 = y */
    vmovl.u8        q2, d1\r_offs /* r = { d4, d5 } */
    vmovn.u16       d21, q11      /* d21 = u */
    vmovl.u8        q3, d1\g_offs /* g = { d6, d7 } */
    vmovn.u16       d22, q12      /* d22 = v */
    vmovl.u8        q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short          19595, 38470, 7471, 11059
    .short          21709, 32768, 27439, 5329
    .short          32767, 128, 32767, 128
    .short          32767, 128, 32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */
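
/*
 * Roughly equivalent scalar C (cf. convsamp() in jcdctmgr.c), shown here
 * only as a reference for what the vector code below computes:
 *
 *   for (row = 0; row < 8; row++)
 *     for (col = 0; col < 8; col++)
 *       *workspace++ = (DCTELEM) sample_data[row][start_col + col]
 *                      - CENTERJSAMPLE;
 */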

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not-so-accurate integer implementation
 * of the forward DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_fdct_ifast' function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)              /* XFIX_0_382683433 */
    .short (139 * 128)             /* XFIX_0_541196100 */
    .short (181 * 128)             /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
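
/*
 * Note (added for clarity): vqdmulh.s16 returns the high half of the
 * doubled product, i.e. (a * b * 2) >> 16, so a constant c encodes the
 * multiplier c / 32768.  For example, 98 * 128 = 12544 and
 * 12544 / 32768 = 0.3828125 ~= 0.382683433.  The '- 256 * 128' term
 * removes the integer part of 1.306562965 from the constant; the code
 * compensates by adding the multiplicand back in separately.
 */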

asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | q8
     *   1  | d18    | d19    | q9
     *   2  | d20    | d21    | q10
     *   3  | d22    | d23    | q11
     *   4  | d24    | d25    | q12
     *   5  | d26    | d27    | q13
     *   6  | d28    | d29    | q14
     *   7  | d30    | d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)

    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
    /* 1-D FDCT */
    vadd.s16        q2, q11, q12
    vswp            d28, d21
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vsub.s16        q5, q8, q2
    vadd.s16        q3, q9, q10
    vadd.s16        q4, q4, q5
    vadd.s16        q2, q8, q2
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vadd.s16        q8, q2, q3
    vsub.s16        q12, q2, q3
    vadd.s16        q3, q6, q7
    vadd.s16        q7, q7, q1
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q4, q1, q3
    vsub.s16        q3, q1, q3
    vadd.s16        q7, q7, q6
    vadd.s16        q11, q11, q6
    vadd.s16        q7, q7, q5
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vadd.s16        q9, q4, q7
    vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP
.endfunc


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses two-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions that belong to the second stage use different
 *       indentation for better readability.
 */
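
/*
 * For reference, a rough scalar C equivalent of one coefficient.  The
 * 'divisors' layout is inferred from the offsets used below (reciprocals
 * at elements [0..63], corrections at [64..127], shifts at [192..255]):
 *
 *   int16_t  q    = workspace[i];
 *   int16_t  sign = q >> 15;                          // 0 or -1
 *   uint16_t x    = (uint16_t)((q ^ sign) - sign);    // abs(q)
 *   x += (uint16_t)divisors[64 + i];                  // add correction
 *   x  = (uint16_t)(((uint32_t)x * (uint16_t)divisors[i]) >> 16);
 *   x >>= divisors[192 + i];                          // final shift
 *   coef_block[i] = (int16_t)((x ^ sign) - sign);     // restore sign
 */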
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10 /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15 /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12 /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2 /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10 /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16 /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15 /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12 /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

      veor.u16        q14, q14, q2 /* restore sign */
      veor.u16        q15, q15, q3
      vsub.u16        q14, q14, q2
      vsub.u16        q15, q15, q3
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
.endfunc


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, and addressing it could potentially yield up to tens
 *       of percent better performance on Cortex-A8/Cortex-A9.
 */

/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded into q0, and the previous 16 source pixels are kept in
 * q1. The shifted-by-one source pixels are constructed in q2 from q0 and
 * q1. Register d28 holds the constant 3 for multiplication, and register
 * q15 holds the +1 rounding bias.
 */
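
/*
 * For reference, the scalar triangle filter computed here (cf.
 * h2v1_fancy_upsample() in jdsample.c): every output pixel is
 * 3/4 * nearer input pixel + 1/4 * further input pixel.  For interior
 * pixels this is
 *
 *   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;
 *   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;
 *
 * The two rounding biases show up below as the +1 bias in q15 (followed
 * by a truncating vshrn) and the rounding vrshrn (an implicit +2).
 */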
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0 /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
 * even and odd groups of 16 pixels, which makes the "vmov q1, q0" backup
 * instruction unnecessary. This unrolling also allows loads and stores to
 * be reordered to hide the multiplication latency and reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    /* odd 16 pixels group */
    vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vmovl.u8        q8, d2
    vext.8          q2, q0, q1, #15
    vmovl.u8        q9, d3
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d2, d28
    vmlal.u8        q11, d3, d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
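
/*
 * Note (added for clarity): the first and last output pixels are plain
 * copies of the first and last input pixels and are stored up front, and
 * the first input pixel is also seeded into d3[7] so that the vectorized
 * loop sees it as the "previous" pixel when processing the first group.
 */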
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub             \WIDTH, \WIDTH, #1
    add             \OUTPTR, \OUTPTR, #1
    ldrb            \TMP1, [\INPTR, \WIDTH]
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
    ldrb            \TMP1, [\INPTR], #1
    strb            \TMP1, [\OUTPTR, #-1]
    vmov.8          d3[7], \TMP1

    subs            \WIDTH, \WIDTH, #32
    blt             5f
0:  /* process 32 pixels per iteration */
    upsample32      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #32
    bge             0b
5:
    adds            \WIDTH, \WIDTH, #16
    blt             1f
0:  /* process 16 pixels if needed */
    upsample16      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #16
1:
    adds            \WIDTH, \WIDTH, #16
    beq             9f

    /* load the remaining 1-15 pixels */
    add             \INPTR, \INPTR, \WIDTH
    tst             \WIDTH, #1
    beq             2f
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #2
    beq             2f
    vext.8          d0, d0, d0, #6
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #4
    beq             2f
    vrev64.32       d0, d0
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[3]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[2]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #8
    beq             2f
    vmov            d1, d0
    sub             \INPTR, \INPTR, #8
    vld1.8          {d0}, [\INPTR]
2:  /* upsample the remaining pixels */
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vrshrn.u16      d10, q8, #2
    vrshrn.u16      d12, q9, #2
    vshrn.u16       d11, q10, #2
    vshrn.u16       d13, q11, #2
    vzip.8          d10, d11
    vzip.8          d12, d13
    /* store the remaining pixels */
    tst             \WIDTH, #8
    beq             2f
    vst1.8          {d10, d11}, [\OUTPTR]!
    vmov            q5, q6
2:
    tst             \WIDTH, #4
    beq             2f
    vst1.8          {d10}, [\OUTPTR]!
    vmov            d10, d11
2:
    tst             \WIDTH, #2
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
    vst1.8          {d10[2]}, [\OUTPTR]!
    vst1.8          {d10[3]}, [\OUTPTR]!
    vext.8          d10, d10, d10, #4
2:
    tst             \WIDTH, #1
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
2:
9:
.endm

asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    push            {r4, r5, r6, lr}
    vpush           {d8-d15}

    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0
    ble             99f

    /* initialize constants */
    vmov.u8         d28, #3
    vmov.u16        q15, #1
11:
    ldr             INPTR, [INPUT_DATA], #4
    ldr             OUTPTR, [OUTPUT_DATA], #4
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt             11b

99:
    vpop            {d8-d15}
    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA_PTR
    .unreq          OUTPUT_DATA

    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          WIDTH
    .unreq          TMP

.endfunc

.purgem upsample16
.purgem upsample32
.purgem upsample_row