/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


#define F_1_082 277              /* FIX(1.082392200) */
#define F_1_414 362              /* FIX(1.414213562) */
#define F_1_847 473              /* FIX(1.847759065) */
#define F_2_613 669              /* FIX(2.613125930) */
#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS 8
#define PASS1_BITS 2
#define PRE_MULTIPLY_SCALE_BITS 2
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)

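/* Worked example of the scaling above: with CONST_BITS = 8,
 * FIX(1.414213562) = 362 and CONST_SHIFT = 16 - 2 - 8 - 1 = 5, so pw_F1414
 * holds 362 << 5 = 11584.  The inputs are pre-shifted left by
 * PRE_MULTIPLY_SCALE_BITS = 2, and vec_madds() keeps only the most
 * significant 17 bits of each 32-bit product (an implicit >> 15), so the net
 * multiplier is
 *   362 * 2^(2 + 5) / 2^15 = 362 / 256 ~= 1.4142,
 * i.e. sqrt(2) at CONST_BITS precision.
 */

/* DO_IDCT() performs one 1-D pass of the fast (AAN-style) inverse DCT on the
 * eight vectors in##0 .. in##7, leaving the results in out0 .. out7.
 */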
#define DO_IDCT(in) \
{ \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, pw_zero); \
  \
  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}


void
jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

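  /* OR together all of the AC coefficient rows so that a single comparison
   * can detect the common case in which every AC term in the block is zero.
   */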
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

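  /* The first row of coefficients is always dequantized; the remaining rows
   * are dequantized only if at least one of them contains a nonzero term.
   */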
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

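  /* Descale the results.  Pass 1 left the values scaled up by PASS1_BITS, and
   * the 2-D IDCT itself scales them up by a factor of 8, so shift right by
   * PASS1_BITS + 3 to bring them back to sample range.
   */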
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

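  /* For each output row: saturate the results to the 8-bit sample range,
   * level-shift them back to unsigned samples by adding CENTERJSAMPLE, and
   * store 8 samples per row using two 4-byte vec_ste() element stores.
   */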
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}