blob: 63d6d8ca1f47205ea35c7422ee6451f7c254bb2c [file] [log] [blame]
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
22
/* CHROMA UPSAMPLING */
24
25#include "jsimd_altivec.h"
26
27
28void
29jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
30 JDIMENSION downsampled_width,
31 JSAMPARRAY input_data,
32 JSAMPARRAY *output_data_ptr)
33{
34 JSAMPARRAY output_data = *output_data_ptr;
35 JSAMPROW inptr, outptr;
DRC2517ef72015-01-14 10:45:31 +000036 int inrow, incol;
DRC52a4ec62015-01-13 09:02:29 +000037
DRCa6a24c22015-01-13 10:00:12 +000038 __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
39 out;
40 __vector short this0e, this0o, this0l, this0h, last0l, last0h,
41 next0l, next0h, outle, outhe, outlo, outho;
DRC52a4ec62015-01-13 09:02:29 +000042
43 /* Constants */
DRCa6a24c22015-01-13 10:00:12 +000044 __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
DRC52a4ec62015-01-13 09:02:29 +000045 last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
46 last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
47 next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
48 next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
DRC771ab192015-02-20 19:57:21 +000049#if __BIG_ENDIAN__
DRC52a4ec62015-01-13 09:02:29 +000050 merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
DRC771ab192015-02-20 19:57:21 +000051#else
52 merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
53#endif
DRC52a4ec62015-01-13 09:02:29 +000054 __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
55
56 for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
57 inptr = input_data[inrow];
58 outptr = output_data[inrow];
59
DRC2517ef72015-01-14 10:45:31 +000060 if (downsampled_width & 15)
61 inptr[downsampled_width] = inptr[downsampled_width - 1];
62
DRCa6a24c22015-01-13 10:00:12 +000063 this0 = vec_ld(0, inptr);
64 p_last0 = vec_perm(this0, this0, last_index_col0);
65 last0 = this0;
DRC52a4ec62015-01-13 09:02:29 +000066
DRC2517ef72015-01-14 10:45:31 +000067 for (incol = downsampled_width; incol > 0;
68 incol -= 16, inptr += 16, outptr += 32) {
DRC52a4ec62015-01-13 09:02:29 +000069
DRC2517ef72015-01-14 10:45:31 +000070 if (downsampled_width - incol > 0) {
DRCa6a24c22015-01-13 10:00:12 +000071 p_last0 = vec_perm(last0, this0, last_index);
72 last0 = this0;
DRC52a4ec62015-01-13 09:02:29 +000073 }
74
DRC2517ef72015-01-14 10:45:31 +000075 if (incol <= 16)
DRCa6a24c22015-01-13 10:00:12 +000076 p_next0 = vec_perm(this0, this0, next_index_lastcol);
DRC52a4ec62015-01-13 09:02:29 +000077 else {
DRCa6a24c22015-01-13 10:00:12 +000078 next0 = vec_ld(16, inptr);
79 p_next0 = vec_perm(this0, next0, next_index);
DRC52a4ec62015-01-13 09:02:29 +000080 }
81
DRCa6a24c22015-01-13 10:00:12 +000082 this0e = (__vector short)vec_mule(this0, pb_three);
83 this0o = (__vector short)vec_mulo(this0, pb_three);
84 this0l = vec_mergeh(this0e, this0o);
85 this0h = vec_mergel(this0e, this0o);
DRC52a4ec62015-01-13 09:02:29 +000086
DRC771ab192015-02-20 19:57:21 +000087 last0l = (__vector short)VEC_UNPACKHU(p_last0);
88 last0h = (__vector short)VEC_UNPACKLU(p_last0);
DRCa6a24c22015-01-13 10:00:12 +000089 last0l = vec_add(last0l, pw_one);
DRC52a4ec62015-01-13 09:02:29 +000090
DRC771ab192015-02-20 19:57:21 +000091 next0l = (__vector short)VEC_UNPACKHU(p_next0);
92 next0h = (__vector short)VEC_UNPACKLU(p_next0);
DRCa6a24c22015-01-13 10:00:12 +000093 next0l = vec_add(next0l, pw_two);
DRC52a4ec62015-01-13 09:02:29 +000094
DRCa6a24c22015-01-13 10:00:12 +000095 outle = vec_add(this0l, last0l);
DRCa6a24c22015-01-13 10:00:12 +000096 outlo = vec_add(this0l, next0l);
DRC52a4ec62015-01-13 09:02:29 +000097 outle = vec_sr(outle, (__vector unsigned short)pw_two);
DRC52a4ec62015-01-13 09:02:29 +000098 outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
DRC52a4ec62015-01-13 09:02:29 +000099
100 out = vec_perm((__vector unsigned char)outle,
101 (__vector unsigned char)outlo, merge_pack_index);
102 vec_st(out, 0, outptr);
DRC2517ef72015-01-14 10:45:31 +0000103
104 if (incol > 8) {
105 last0h = vec_add(last0h, pw_one);
106 next0h = vec_add(next0h, pw_two);
107
108 outhe = vec_add(this0h, last0h);
109 outho = vec_add(this0h, next0h);
110 outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
111 outho = vec_sr(outho, (__vector unsigned short)pw_two);
112
113 out = vec_perm((__vector unsigned char)outhe,
114 (__vector unsigned char)outho, merge_pack_index);
115 vec_st(out, 16, outptr);
116 }
DRC52a4ec62015-01-13 09:02:29 +0000117
DRCa6a24c22015-01-13 10:00:12 +0000118 this0 = next0;
DRC52a4ec62015-01-13 09:02:29 +0000119 }
120 }
121}
122
123
124void
125jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
126 JDIMENSION downsampled_width,
127 JSAMPARRAY input_data,
128 JSAMPARRAY *output_data_ptr)
129{
130 JSAMPARRAY output_data = *output_data_ptr;
131 JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
DRC2517ef72015-01-14 10:45:31 +0000132 int inrow, outrow, incol;
DRC52a4ec62015-01-13 09:02:29 +0000133
DRCa6a24c22015-01-13 10:00:12 +0000134 __vector unsigned char this_1, this0, this1, out;
135 __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
DRC52a4ec62015-01-13 09:02:29 +0000136 lastcolsum_1h, lastcolsum1h,
137 p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
138 thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
139 nextcolsum_1l = {0}, nextcolsum_1h = {0},
140 nextcolsum1l = {0}, nextcolsum1h = {0},
141 p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
142 tmpl, tmph, outle, outhe, outlo, outho;
143
144 /* Constants */
145 __vector unsigned char pb_zero = { __16X(0) },
146 last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
147 last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
148 next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
149 next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
DRC771ab192015-02-20 19:57:21 +0000150#if __BIG_ENDIAN__
DRC52a4ec62015-01-13 09:02:29 +0000151 merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
DRC771ab192015-02-20 19:57:21 +0000152#else
153 merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
154#endif
DRC52a4ec62015-01-13 09:02:29 +0000155 __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
156 pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
157 __vector unsigned short pw_four = { __8X(4) };
158
159 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
160
161 inptr_1 = input_data[inrow - 1];
162 inptr0 = input_data[inrow];
163 inptr1 = input_data[inrow + 1];
164 outptr0 = output_data[outrow++];
165 outptr1 = output_data[outrow++];
166
DRC2517ef72015-01-14 10:45:31 +0000167 if (downsampled_width & 15) {
168 inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
169 inptr0[downsampled_width] = inptr0[downsampled_width - 1];
170 inptr1[downsampled_width] = inptr1[downsampled_width - 1];
171 }
172
DRCa6a24c22015-01-13 10:00:12 +0000173 this0 = vec_ld(0, inptr0);
DRC771ab192015-02-20 19:57:21 +0000174 this0l = (__vector short)VEC_UNPACKHU(this0);
175 this0h = (__vector short)VEC_UNPACKLU(this0);
DRCa6a24c22015-01-13 10:00:12 +0000176 this0l = vec_mladd(this0l, pw_three, pw_zero);
177 this0h = vec_mladd(this0h, pw_three, pw_zero);
DRC52a4ec62015-01-13 09:02:29 +0000178
DRCa6a24c22015-01-13 10:00:12 +0000179 this_1 = vec_ld(0, inptr_1);
DRC771ab192015-02-20 19:57:21 +0000180 this_1l = (__vector short)VEC_UNPACKHU(this_1);
181 this_1h = (__vector short)VEC_UNPACKLU(this_1);
DRCa6a24c22015-01-13 10:00:12 +0000182 thiscolsum_1l = vec_add(this0l, this_1l);
183 thiscolsum_1h = vec_add(this0h, this_1h);
DRC52a4ec62015-01-13 09:02:29 +0000184 lastcolsum_1h = thiscolsum_1h;
185 p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
186 p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
187
DRCa6a24c22015-01-13 10:00:12 +0000188 this1 = vec_ld(0, inptr1);
DRC771ab192015-02-20 19:57:21 +0000189 this1l = (__vector short)VEC_UNPACKHU(this1);
190 this1h = (__vector short)VEC_UNPACKLU(this1);
DRCa6a24c22015-01-13 10:00:12 +0000191 thiscolsum1l = vec_add(this0l, this1l);
192 thiscolsum1h = vec_add(this0h, this1h);
DRC52a4ec62015-01-13 09:02:29 +0000193 lastcolsum1h = thiscolsum1h;
194 p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
195 p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
196
DRC2517ef72015-01-14 10:45:31 +0000197 for (incol = downsampled_width; incol > 0;
198 incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
DRC52a4ec62015-01-13 09:02:29 +0000199 outptr0 += 32, outptr1 += 32) {
200
DRC2517ef72015-01-14 10:45:31 +0000201 if (downsampled_width - incol > 0) {
DRC52a4ec62015-01-13 09:02:29 +0000202 p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
203 p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
204 p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
205 p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
206 lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
207 }
208
DRC2517ef72015-01-14 10:45:31 +0000209 if (incol <= 16) {
DRC52a4ec62015-01-13 09:02:29 +0000210 p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
211 p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
212 next_index_lastcol);
213 p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
214 p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
215 next_index_lastcol);
216 } else {
DRCa6a24c22015-01-13 10:00:12 +0000217 this0 = vec_ld(16, inptr0);
DRC771ab192015-02-20 19:57:21 +0000218 this0l = (__vector short)VEC_UNPACKHU(this0);
219 this0h = (__vector short)VEC_UNPACKLU(this0);
DRCa6a24c22015-01-13 10:00:12 +0000220 this0l = vec_mladd(this0l, pw_three, pw_zero);
221 this0h = vec_mladd(this0h, pw_three, pw_zero);
DRC52a4ec62015-01-13 09:02:29 +0000222
DRCa6a24c22015-01-13 10:00:12 +0000223 this_1 = vec_ld(16, inptr_1);
DRC771ab192015-02-20 19:57:21 +0000224 this_1l = (__vector short)VEC_UNPACKHU(this_1);
225 this_1h = (__vector short)VEC_UNPACKLU(this_1);
DRCa6a24c22015-01-13 10:00:12 +0000226 nextcolsum_1l = vec_add(this0l, this_1l);
227 nextcolsum_1h = vec_add(this0h, this_1h);
DRC52a4ec62015-01-13 09:02:29 +0000228 p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
229 p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
230
DRCa6a24c22015-01-13 10:00:12 +0000231 this1 = vec_ld(16, inptr1);
DRC771ab192015-02-20 19:57:21 +0000232 this1l = (__vector short)VEC_UNPACKHU(this1);
233 this1h = (__vector short)VEC_UNPACKLU(this1);
DRCa6a24c22015-01-13 10:00:12 +0000234 nextcolsum1l = vec_add(this0l, this1l);
235 nextcolsum1h = vec_add(this0h, this1h);
DRC52a4ec62015-01-13 09:02:29 +0000236 p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
237 p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
238 }
239
240 /* Process the upper row */
241
242 tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
DRC52a4ec62015-01-13 09:02:29 +0000243 outle = vec_add(tmpl, p_lastcolsum_1l);
DRC52a4ec62015-01-13 09:02:29 +0000244 outle = vec_add(outle, pw_eight);
DRC52a4ec62015-01-13 09:02:29 +0000245 outle = vec_sr(outle, pw_four);
DRC52a4ec62015-01-13 09:02:29 +0000246
247 outlo = vec_add(tmpl, p_nextcolsum_1l);
DRC52a4ec62015-01-13 09:02:29 +0000248 outlo = vec_add(outlo, pw_seven);
DRC52a4ec62015-01-13 09:02:29 +0000249 outlo = vec_sr(outlo, pw_four);
DRC52a4ec62015-01-13 09:02:29 +0000250
251 out = vec_perm((__vector unsigned char)outle,
252 (__vector unsigned char)outlo, merge_pack_index);
253 vec_st(out, 0, outptr0);
254
DRC2517ef72015-01-14 10:45:31 +0000255 if (incol > 8) {
256 tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
257 outhe = vec_add(tmph, p_lastcolsum_1h);
258 outhe = vec_add(outhe, pw_eight);
259 outhe = vec_sr(outhe, pw_four);
260
261 outho = vec_add(tmph, p_nextcolsum_1h);
262 outho = vec_add(outho, pw_seven);
263 outho = vec_sr(outho, pw_four);
264
265 out = vec_perm((__vector unsigned char)outhe,
266 (__vector unsigned char)outho, merge_pack_index);
267 vec_st(out, 16, outptr0);
268 }
DRC52a4ec62015-01-13 09:02:29 +0000269
270 /* Process the lower row */
271
272 tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
DRC52a4ec62015-01-13 09:02:29 +0000273 outle = vec_add(tmpl, p_lastcolsum1l);
DRC52a4ec62015-01-13 09:02:29 +0000274 outle = vec_add(outle, pw_eight);
DRC52a4ec62015-01-13 09:02:29 +0000275 outle = vec_sr(outle, pw_four);
DRC52a4ec62015-01-13 09:02:29 +0000276
277 outlo = vec_add(tmpl, p_nextcolsum1l);
DRC52a4ec62015-01-13 09:02:29 +0000278 outlo = vec_add(outlo, pw_seven);
DRC52a4ec62015-01-13 09:02:29 +0000279 outlo = vec_sr(outlo, pw_four);
DRC52a4ec62015-01-13 09:02:29 +0000280
281 out = vec_perm((__vector unsigned char)outle,
282 (__vector unsigned char)outlo, merge_pack_index);
283 vec_st(out, 0, outptr1);
DRC2517ef72015-01-14 10:45:31 +0000284
285 if (incol > 8) {
286 tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
287 outhe = vec_add(tmph, p_lastcolsum1h);
288 outhe = vec_add(outhe, pw_eight);
289 outhe = vec_sr(outhe, pw_four);
290
291 outho = vec_add(tmph, p_nextcolsum1h);
292 outho = vec_add(outho, pw_seven);
293 outho = vec_sr(outho, pw_four);
294
295 out = vec_perm((__vector unsigned char)outhe,
296 (__vector unsigned char)outho, merge_pack_index);
297 vec_st(out, 16, outptr1);
298 }
DRC52a4ec62015-01-13 09:02:29 +0000299
300 thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
301 thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
302 }
303 }
304}
DRCc641cdd2015-01-14 15:41:11 +0000305
306
/* These are rarely used (mainly just for decompressing YCCK images) */
308
309void
310jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
311 JDIMENSION output_width,
312 JSAMPARRAY input_data,
DRCbd498032016-02-19 08:53:33 -0600313 JSAMPARRAY *output_data_ptr)
DRCc641cdd2015-01-14 15:41:11 +0000314{
315 JSAMPARRAY output_data = *output_data_ptr;
316 JSAMPROW inptr, outptr;
317 int inrow, incol;
318
319 __vector unsigned char in, inl, inh;
320
321 for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
322 inptr = input_data[inrow];
323 outptr = output_data[inrow];
324
325 for (incol = (output_width + 31) & (~31); incol > 0;
326 incol -= 64, inptr += 32, outptr += 64) {
327
328 in = vec_ld(0, inptr);
329 inl = vec_mergeh(in, in);
330 inh = vec_mergel(in, in);
331
332 vec_st(inl, 0, outptr);
333 vec_st(inh, 16, outptr);
334
335 if (incol > 32) {
336 in = vec_ld(16, inptr);
337 inl = vec_mergeh(in, in);
338 inh = vec_mergel(in, in);
339
340 vec_st(inl, 32, outptr);
341 vec_st(inh, 48, outptr);
342 }
343 }
344 }
345}
346
347
348void
349jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
350 JDIMENSION output_width,
351 JSAMPARRAY input_data,
DRCbd498032016-02-19 08:53:33 -0600352 JSAMPARRAY *output_data_ptr)
DRCc641cdd2015-01-14 15:41:11 +0000353{
354 JSAMPARRAY output_data = *output_data_ptr;
355 JSAMPROW inptr, outptr0, outptr1;
356 int inrow, outrow, incol;
357
358 __vector unsigned char in, inl, inh;
359
360 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
361
362 inptr = input_data[inrow];
363 outptr0 = output_data[outrow++];
364 outptr1 = output_data[outrow++];
365
366 for (incol = (output_width + 31) & (~31); incol > 0;
367 incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
368
369 in = vec_ld(0, inptr);
370 inl = vec_mergeh(in, in);
371 inh = vec_mergel(in, in);
372
373 vec_st(inl, 0, outptr0);
374 vec_st(inl, 0, outptr1);
375
376 vec_st(inh, 16, outptr0);
377 vec_st(inh, 16, outptr1);
378
379 if (incol > 32) {
380 in = vec_ld(16, inptr);
381 inl = vec_mergeh(in, in);
382 inh = vec_mergel(in, in);
383
384 vec_st(inl, 32, outptr0);
385 vec_st(inl, 32, outptr1);
386
387 vec_st(inh, 48, outptr0);
388 vec_st(inh, 48, outptr1);
389 }
390 }
391 }
392}