/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
22
/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */
31
32#include "jsimd_altivec.h"
33
34
/* Fixed-point multipliers for the AAN IDCT, scaled by 2^8 (CONST_BITS).
 * FIX(x) denotes round(x * 2^CONST_BITS). */
#define F_1_082 277              /* FIX(1.082392200) */
#define F_1_414 362              /* FIX(1.414213562) */
#define F_1_847 473              /* FIX(1.847759065) */
#define F_2_613 669              /* FIX(2.613125930) */
/* F_2_613 split as (1 + F_1_613) so the odd-part multiply stays in range;
 * see the overflow note inside DO_IDCT. */
#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */

#define CONST_BITS 8
#define PASS1_BITS 2
#define PRE_MULTIPLY_SCALE_BITS 2
/* Left-shift applied to the constants above so that vec_madds(), which keeps
 * the most significant 17 bits of the 32-bit product, yields the intended
 * fixed-point result (the -1 compensates for vec_madds' implicit >>15). */
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
45
46
/* Perform one pass of the 8-point AAN inverse DCT on the eight vectors
 * in##0 .. in##7 (each vector holds one element position from 8 independent
 * 1-D transforms, so the 8 transforms proceed in parallel), leaving the
 * results in out0 .. out7.  Requires zero, pw_F1414, pw_F1847, pw_MF1613,
 * pw_F1082, and pre_multiply_scale_bits to be in scope.  Inputs are
 * pre-shifted by PRE_MULTIPLY_SCALE_BITS before each vec_madds() so that
 * the saturated multiply-high keeps the needed precision. */
#define DO_IDCT(in) \
{ \
  /* Even part */ \
  \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  /* tmp12 = (in2 - in6) * 1.414213562 - tmp13 */ \
  tmp12 = vec_sub(in##2, in##6); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  tmp12 = vec_madds(tmp12, pw_F1414, zero); \
  tmp12 = vec_sub(tmp12, tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp3 = vec_sub(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  \
  /* Odd part */ \
  \
  z13 = vec_add(in##5, in##3); \
  z10 = vec_sub(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z11 = vec_add(in##1, in##7); \
  z12s = vec_sub(in##1, in##7); \
  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
  \
  /* tmp11 = (z11 - z13) * 1.414213562 */ \
  tmp11 = vec_sub(z11, z13); \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, zero); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* To avoid overflow... \
   * \
   * (Original) \
   * tmp12 = -2.613125930 * z10 + z5; \
   * \
   * (This implementation) \
   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
   *       = -1.613125930 * z10 - z10 + z5; \
   */ \
  \
  /* z5 = (z10 + z12) * 1.847759065 */ \
  z5 = vec_add(z10s, z12s); \
  z5 = vec_madds(z5, pw_F1847, zero); \
  \
  /* tmp10 = z12 * 1.082392200 - z5 */ \
  tmp10 = vec_madds(z12s, pw_F1082, zero); \
  tmp10 = vec_sub(tmp10, z5); \
  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
  tmp12 = vec_sub(tmp12, z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  out0 = vec_add(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out3 = vec_sub(tmp3, tmp4); \
  out4 = vec_add(tmp3, tmp4); \
  out5 = vec_sub(tmp2, tmp5); \
  out6 = vec_sub(tmp1, tmp6); \
  out7 = vec_sub(tmp0, tmp7); \
}
111
112
/* Dequantize and inverse-transform one 8x8 block of DCT coefficients using
 * the fast (AAN) integer method, storing the resulting 8x8 block of samples.
 *
 * dct_table_  - 64 16-bit dequantization multipliers (IFAST scaling)
 * coef_block  - 64 quantized coefficients, 8 rows of 8 shorts (16-byte
 *               aligned; vec_ld would silently truncate the address if not)
 * output_buf  - 8 row pointers for the destination samples
 * output_col  - offset of this block within each output row
 */
void
jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;
  int *outptr;

  /* Constants */
  __vector short zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  /* Each vector load picks up one row of coefficients; since pass 1 operates
   * on columns, the block is effectively processed transposed until the
   * TRANSPOSE below. */
  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR together all AC rows so a single compare can detect the common
   * "DC-only block" case. */
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  /* Dequantize: 16-bit modular multiply (low half), matching the scalar
   * IFAST method's DEQUANTIZE. */
  quant0 = *(__vector short *)&dct_table[0];
  col0 = vec_mladd(col0, quant0, zero);

  if (vec_all_eq(tmp1, zero)) {
    /* AC terms all zero: each output row of pass 1 is just the (dequantized)
     * DC value replicated, so skip the column IDCT entirely. */

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = *(__vector short *)&dct_table[8];
    quant2 = *(__vector short *)&dct_table[16];
    quant3 = *(__vector short *)&dct_table[24];
    quant4 = *(__vector short *)&dct_table[32];
    quant5 = *(__vector short *)&dct_table[40];
    quant6 = *(__vector short *)&dct_table[48];
    quant7 = *(__vector short *)&dct_table[56];

    col1 = vec_mladd(col1, quant1, zero);
    col2 = vec_mladd(col2, quant2, zero);
    col3 = vec_mladd(col3, quant3, zero);
    col4 = vec_mladd(col4, quant4, zero);
    col5 = vec_mladd(col5, quant5, zero);
    col6 = vec_mladd(col6, quant6, zero);
    col7 = vec_mladd(col7, quant7, zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

  /* Descale: remove PASS1_BITS and the 3 fractional bits carried through
   * the transform (arithmetic shift, so rounding is toward -infinity). */
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

  /* Pack each 8-short row to signed bytes with saturation, re-center around
   * CENTERJSAMPLE, and store the 8 sample bytes as two 32-bit element stores.
   * NOTE(review): vec_ste writes to the nearest naturally aligned address,
   * so this assumes output_buf[i] + output_col is 4-byte aligned — confirm
   * callers guarantee that. */
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}