/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* SLOW INTEGER INVERSE DCT */

#include "jsimd_altivec.h"


/*
 * Fixed-point multipliers.  Each F_a_bcd is the real constant named in its
 * trailing comment scaled by 2^CONST_BITS (8192) and rounded to the nearest
 * integer, e.g. 0.541196100 * 8192 = 4433.48 -> 4433.
 */
#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/*
 * CONST_BITS:  number of fraction bits carried by the F_* multipliers.
 * PASS1_BITS:  extra fraction bits kept in the intermediate results that
 *              pass 1 hands to pass 2.
 * DESCALE_P1:  right shift applied at the end of pass 1 (removes CONST_BITS
 *              but leaves PASS1_BITS of extra precision).
 * DESCALE_P2:  right shift applied at the end of pass 2 (removes CONST_BITS,
 *              PASS1_BITS and 3 further bits).
 *              NOTE(review): the 3 further bits presumably correspond to the
 *              divide-by-8 of the 2-D IDCT, as in jidctint.c -- confirm.
 */
#define CONST_BITS  13
#define PASS1_BITS  2
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)


/*
 * DO_IDCT(in, PASS) -- one 1-D pass of the slow-integer inverse DCT, applied
 * lane-wise to eight vectors of 16-bit values.
 *
 *   in    Identifier prefix of the eight input vectors: the macro reads
 *         in##0 .. in##7 and uses in##26l/h, in##71l/h and in##53l/h as
 *         scratch vectors for the interleaved operand pairs.
 *   PASS  1 or 2; pasted onto pd_descale_p and descale_p to select the
 *         rounding constant and the final right-shift count for that pass.
 *
 * Results are written to out0 .. out7 (16-bit vectors).  Every 32-bit sum has
 * the rounding constant pd_descale_p##PASS folded in (via tmp10..tmp13), is
 * shifted right arithmetically by descale_p##PASS, and is then narrowed with
 * vec_pack (modular, non-saturating).
 *
 * The macro is not self-contained: all temporaries (tmp0..tmp3, tmp0l..tmp3h,
 * tmp10l..tmp13h, z3, z4, z34l/h, z3l..z4h, out0l..out7h) and all constants
 * (pw_*, pd_zero, const_bits, pd_descale_p*, descale_p*) must be declared in
 * the invoking scope.
 *
 * Each vec_msums(a, b, c) multiplies adjacent pairs of 16-bit elements of a
 * and b, adds the two products and the matching 32-bit element of c.  The
 * inputs are therefore interleaved with vec_mergeh/vec_mergel so that a single
 * vec_msums evaluates "x * k1 + y * k2 (+ accumulator)" for four lanes.
 */
#define DO_IDCT(in, PASS)  \
{  \
  /* Even part  \
   *  \
   * (Original)  \
   * z1 = (z2 + z3) * 0.541196100;  \
   * tmp2 = z1 + z3 * -1.847759065;  \
   * tmp3 = z1 + z2 * 0.765366865;  \
   *  \
   * (This implementation)  \
   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
   */  \
  \
  in##26l = vec_mergeh(in##2, in##6);  \
  in##26h = vec_mergel(in##2, in##6);  \
  \
  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero);  \
  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero);  \
  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero);  \
  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero);  \
  \
  tmp0 = vec_add(in##0, in##4);  \
  tmp1 = vec_sub(in##0, in##4);  \
  \
  /* Widen to 32 bits, scale by 2^CONST_BITS and pre-add the rounding  \
   * constant so that the final vec_sra rounds to nearest.  \
   */  \
  tmp0l = vec_unpackh(tmp0);  \
  tmp0h = vec_unpackl(tmp0);  \
  tmp0l = vec_sl(tmp0l, const_bits);  \
  tmp0h = vec_sl(tmp0h, const_bits);  \
  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  \
  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
  \
  tmp10l = vec_add(tmp0l, tmp3l);  \
  tmp10h = vec_add(tmp0h, tmp3h);  \
  tmp13l = vec_sub(tmp0l, tmp3l);  \
  tmp13h = vec_sub(tmp0h, tmp3h);  \
  \
  tmp1l = vec_unpackh(tmp1);  \
  tmp1h = vec_unpackl(tmp1);  \
  tmp1l = vec_sl(tmp1l, const_bits);  \
  tmp1h = vec_sl(tmp1h, const_bits);  \
  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
  \
  tmp11l = vec_add(tmp1l, tmp2l);  \
  tmp11h = vec_add(tmp1h, tmp2h);  \
  tmp12l = vec_sub(tmp1l, tmp2l);  \
  tmp12h = vec_sub(tmp1h, tmp2h);  \
  \
  /* Odd part */  \
  \
  z3 = vec_add(in##3, in##7);  \
  z4 = vec_add(in##1, in##5);  \
  \
  /* (Original)  \
   * z5 = (z3 + z4) * 1.175875602;  \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
   * z3 += z5;  z4 += z5;  \
   *  \
   * (This implementation)  \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
   */  \
  \
  z34l = vec_mergeh(z3, z4);  \
  z34h = vec_mergel(z3, z4);  \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero);  \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero);  \
  z4l = vec_msums(z34l, pw_f117_f078, pd_zero);  \
  z4h = vec_msums(z34h, pw_f117_f078, pd_zero);  \
  \
  /* (Original)  \
   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
   *  \
   * (This implementation)  \
   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
   * tmp0 += z3;  tmp1 += z4;  \
   * tmp2 += z3;  tmp3 += z4;  \
   *  \
   * In the notation above tmp0..tmp3 stand for inputs 7, 5, 3 and 1; the  \
   * trailing "+= z3 / += z4" is folded into the vec_msums accumulator.  \
   */  \
  \
  in##71l = vec_mergeh(in##7, in##1);  \
  in##71h = vec_mergel(in##7, in##1);  \
  \
  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  \
  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  \
  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
  \
  in##53l = vec_mergeh(in##5, in##3);  \
  in##53h = vec_mergel(in##5, in##3);  \
  \
  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
  \
  /* Final output stage */  \
  \
  out0l = vec_add(tmp10l, tmp3l);  \
  out0h = vec_add(tmp10h, tmp3h);  \
  out7l = vec_sub(tmp10l, tmp3l);  \
  out7h = vec_sub(tmp10h, tmp3h);  \
  \
  out0l = vec_sra(out0l, descale_p##PASS);  \
  out0h = vec_sra(out0h, descale_p##PASS);  \
  out7l = vec_sra(out7l, descale_p##PASS);  \
  out7h = vec_sra(out7h, descale_p##PASS);  \
  \
  out0 = vec_pack(out0l, out0h);  \
  out7 = vec_pack(out7l, out7h);  \
  \
  out1l = vec_add(tmp11l, tmp2l);  \
  out1h = vec_add(tmp11h, tmp2h);  \
  out6l = vec_sub(tmp11l, tmp2l);  \
  out6h = vec_sub(tmp11h, tmp2h);  \
  \
  out1l = vec_sra(out1l, descale_p##PASS);  \
  out1h = vec_sra(out1h, descale_p##PASS);  \
  out6l = vec_sra(out6l, descale_p##PASS);  \
  out6h = vec_sra(out6h, descale_p##PASS);  \
  \
  out1 = vec_pack(out1l, out1h);  \
  out6 = vec_pack(out6l, out6h);  \
  \
  out2l = vec_add(tmp12l, tmp1l);  \
  out2h = vec_add(tmp12h, tmp1h);  \
  out5l = vec_sub(tmp12l, tmp1l);  \
  out5h = vec_sub(tmp12h, tmp1h);  \
  \
  out2l = vec_sra(out2l, descale_p##PASS);  \
  out2h = vec_sra(out2h, descale_p##PASS);  \
  out5l = vec_sra(out5l, descale_p##PASS);  \
  out5h = vec_sra(out5h, descale_p##PASS);  \
  \
  out2 = vec_pack(out2l, out2h);  \
  out5 = vec_pack(out5l, out5h);  \
  \
  out3l = vec_add(tmp13l, tmp0l);  \
  out3h = vec_add(tmp13h, tmp0h);  \
  out4l = vec_sub(tmp13l, tmp0l);  \
  out4h = vec_sub(tmp13h, tmp0h);  \
  \
  out3l = vec_sra(out3l, descale_p##PASS);  \
  out3h = vec_sra(out3h, descale_p##PASS);  \
  out4l = vec_sra(out4l, descale_p##PASS);  \
  out4h = vec_sra(out4h, descale_p##PASS);  \
  \
  out3 = vec_pack(out3l, out3h);  \
  out4 = vec_pack(out4l, out4h);  \
}


208void
209jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
210 JSAMPARRAY output_buf, JDIMENSION output_col)
211{
212 short *dct_table = (short *)dct_table_;
DRCa6a24c22015-01-13 10:00:12 +0000213 int *outptr;
214
DRC535674b2014-12-22 01:00:42 +0000215 __vector short row0, row1, row2, row3, row4, row5, row6, row7,
216 col0, col1, col2, col3, col4, col5, col6, col7,
217 quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
218 tmp0, tmp1, tmp2, tmp3, z3, z4,
219 z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
220 row71l, row71h, row26l, row26h, row53l, row53h,
221 out0, out1, out2, out3, out4, out5, out6, out7;
222 __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
223 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
224 z3l, z3h, z4l, z4h,
225 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
226 out5l, out5h, out6l, out6h, out7l, out7h;
DRC246b01b2015-01-16 03:13:16 +0000227 __vector signed char outb;
DRC535674b2014-12-22 01:00:42 +0000228
229 /* Constants */
DRCa6a24c22015-01-13 10:00:12 +0000230 __vector short pw_zero = { __8X(0) },
DRC535674b2014-12-22 01:00:42 +0000231 pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
232 pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
233 pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
234 pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
235 pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
236 pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
237 pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
238 pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
239 __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
DRCa6a24c22015-01-13 10:00:12 +0000240 __vector int pd_zero = { __4X(0) },
DRC535674b2014-12-22 01:00:42 +0000241 pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
242 pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
243 __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
244 descale_p2 = { __4X(DESCALE_P2) },
245 const_bits = { __4X(CONST_BITS) };
246 __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
247
DRC13af1392014-12-22 01:38:01 +0000248 /* Pass 1: process columns */
DRC535674b2014-12-22 01:00:42 +0000249
DRCd71a6e02015-01-11 06:34:47 +0000250 col0 = vec_ld(0, coef_block);
251 col1 = vec_ld(16, coef_block);
252 col2 = vec_ld(32, coef_block);
253 col3 = vec_ld(48, coef_block);
254 col4 = vec_ld(64, coef_block);
255 col5 = vec_ld(80, coef_block);
256 col6 = vec_ld(96, coef_block);
257 col7 = vec_ld(112, coef_block);
DRC535674b2014-12-22 01:00:42 +0000258
259 tmp1 = vec_or(col1, col2);
260 tmp2 = vec_or(col3, col4);
261 tmp1 = vec_or(tmp1, tmp2);
262 tmp3 = vec_or(col5, col6);
263 tmp3 = vec_or(tmp3, col7);
264 tmp1 = vec_or(tmp1, tmp3);
265
DRCd71a6e02015-01-11 06:34:47 +0000266 quant0 = vec_ld(0, dct_table);
DRCa6a24c22015-01-13 10:00:12 +0000267 col0 = vec_mladd(col0, quant0, pw_zero);
DRC535674b2014-12-22 01:00:42 +0000268
DRCa6a24c22015-01-13 10:00:12 +0000269 if (vec_all_eq(tmp1, pw_zero)) {
DRC535674b2014-12-22 01:00:42 +0000270 /* AC terms all zero */
271
272 col0 = vec_sl(col0, pass1_bits);
273
274 row0 = vec_splat(col0, 0);
275 row1 = vec_splat(col0, 1);
276 row2 = vec_splat(col0, 2);
277 row3 = vec_splat(col0, 3);
278 row4 = vec_splat(col0, 4);
279 row5 = vec_splat(col0, 5);
280 row6 = vec_splat(col0, 6);
281 row7 = vec_splat(col0, 7);
282
283 } else {
284
DRCd71a6e02015-01-11 06:34:47 +0000285 quant1 = vec_ld(16, dct_table);
286 quant2 = vec_ld(32, dct_table);
287 quant3 = vec_ld(48, dct_table);
288 quant4 = vec_ld(64, dct_table);
289 quant5 = vec_ld(80, dct_table);
290 quant6 = vec_ld(96, dct_table);
291 quant7 = vec_ld(112, dct_table);
DRC535674b2014-12-22 01:00:42 +0000292
DRCa6a24c22015-01-13 10:00:12 +0000293 col1 = vec_mladd(col1, quant1, pw_zero);
294 col2 = vec_mladd(col2, quant2, pw_zero);
295 col3 = vec_mladd(col3, quant3, pw_zero);
296 col4 = vec_mladd(col4, quant4, pw_zero);
297 col5 = vec_mladd(col5, quant5, pw_zero);
298 col6 = vec_mladd(col6, quant6, pw_zero);
299 col7 = vec_mladd(col7, quant7, pw_zero);
DRC535674b2014-12-22 01:00:42 +0000300
301 DO_IDCT(col, 1);
302
303 TRANSPOSE(out, row);
304 }
305
DRC13af1392014-12-22 01:38:01 +0000306 /* Pass 2: process rows */
307
DRC535674b2014-12-22 01:00:42 +0000308 DO_IDCT(row, 2);
309
DRC246b01b2015-01-16 03:13:16 +0000310 TRANSPOSE(out, col);
DRC535674b2014-12-22 01:00:42 +0000311
DRC246b01b2015-01-16 03:13:16 +0000312 outb = vec_packs(col0, col0);
313 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000314 outptr = (int *)(output_buf[0] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000315 vec_ste((__vector int)outb, 0, outptr);
316 vec_ste((__vector int)outb, 4, outptr);
DRC535674b2014-12-22 01:00:42 +0000317
DRC246b01b2015-01-16 03:13:16 +0000318 outb = vec_packs(col1, col1);
319 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000320 outptr = (int *)(output_buf[1] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000321 vec_ste((__vector int)outb, 0, outptr);
322 vec_ste((__vector int)outb, 4, outptr);
DRC535674b2014-12-22 01:00:42 +0000323
DRC246b01b2015-01-16 03:13:16 +0000324 outb = vec_packs(col2, col2);
325 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000326 outptr = (int *)(output_buf[2] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000327 vec_ste((__vector int)outb, 0, outptr);
328 vec_ste((__vector int)outb, 4, outptr);
DRC535674b2014-12-22 01:00:42 +0000329
DRC246b01b2015-01-16 03:13:16 +0000330 outb = vec_packs(col3, col3);
331 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000332 outptr = (int *)(output_buf[3] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000333 vec_ste((__vector int)outb, 0, outptr);
334 vec_ste((__vector int)outb, 4, outptr);
DRC45453082014-12-22 16:04:17 +0000335
DRC246b01b2015-01-16 03:13:16 +0000336 outb = vec_packs(col4, col4);
337 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000338 outptr = (int *)(output_buf[4] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000339 vec_ste((__vector int)outb, 0, outptr);
340 vec_ste((__vector int)outb, 4, outptr);
DRC45453082014-12-22 16:04:17 +0000341
DRC246b01b2015-01-16 03:13:16 +0000342 outb = vec_packs(col5, col5);
343 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000344 outptr = (int *)(output_buf[5] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000345 vec_ste((__vector int)outb, 0, outptr);
346 vec_ste((__vector int)outb, 4, outptr);
DRC45453082014-12-22 16:04:17 +0000347
DRC246b01b2015-01-16 03:13:16 +0000348 outb = vec_packs(col6, col6);
349 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000350 outptr = (int *)(output_buf[6] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000351 vec_ste((__vector int)outb, 0, outptr);
352 vec_ste((__vector int)outb, 4, outptr);
DRC45453082014-12-22 16:04:17 +0000353
DRC246b01b2015-01-16 03:13:16 +0000354 outb = vec_packs(col7, col7);
355 outb = vec_add(outb, pb_centerjsamp);
DRC45453082014-12-22 16:04:17 +0000356 outptr = (int *)(output_buf[7] + output_col);
DRC246b01b2015-01-16 03:13:16 +0000357 vec_ste((__vector int)outb, 0, outptr);
358 vec_ste((__vector int)outb, 4, outptr);
DRC535674b2014-12-22 01:00:42 +0000359}