/*
 * Copyright (C) 2010-2011 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */

#ifdef ANDROID_INTELSSE2_IDCT
#include <emmintrin.h>

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif

#define BITS_INV_ACC 4
#define SHIFT_INV_ROW 12
#define SHIFT_INV_COL 5
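
/*
 * These shift values look like the usual AP-922 derivation from
 * BITS_INV_ACC: SHIFT_INV_ROW = 16 - BITS_INV_ACC = 12 and
 * SHIFT_INV_COL = BITS_INV_ACC + 1 = 5.  The rounders below are simply
 * 1 << (shift - 1): 2048 for the row pass, 16 for the column pass, and
 * 15 (= 16 - 1) as the corrected rounder applied to the subtractive
 * column outputs.  Only the vector copies further down are actually
 * referenced by the code.
 */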
const short RND_INV_ROW = 2048;
const short RND_INV_COL = 16;
const short RND_INV_CORR = 15;

static const short __attribute__ ((aligned(16))) M128_one_corr[8] = {1, 1, 1, 1, 1, 1, 1, 1};
static const short __attribute__ ((aligned(16))) M128_round_inv_row[8] = {2048, 0, 2048, 0, 2048, 0, 2048, 0};
static const short __attribute__ ((aligned(16))) M128_round_inv_col[8] = {16, 16, 16, 16, 16, 16, 16, 16};
static const short __attribute__ ((aligned(16))) M128_round_inv_corr[8] = {15, 15, 15, 15, 15, 15, 15, 15};

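/*
 * The tangent/cosine constants below appear to be Q16 fixed point:
 * 13036 ~= tan(pi/16) * 65536 and 27146 ~= tan(2*pi/16) * 65536.
 * tan(3*pi/16) * 65536 (~43790) and cos(pi/4) * 65536 (~46341) do not
 * fit in a signed short, so they are stored wrapped modulo 2^16
 * (-21746 and -19195); the column pass compensates by adding the
 * original operand back after each _mm_mulhi_epi16 (the "coef
 * adjustment" steps), since mulhi(x, c - 65536) == mulhi(x, c) - x.
 */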
static const short __attribute__ ((aligned(16))) M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};
static const short __attribute__ ((aligned(16))) M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};
static const short __attribute__ ((aligned(16))) M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746};
static const short __attribute__ ((aligned(16))) M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};

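/*
 * JPEG samples are level-shifted by CENTERJSAMPLE (128) before the
 * forward DCT, so 128 is added back after the inverse transform.
 * Together with the saturating _mm_packus_epi16 stores (which clamp to
 * 0..255), this is presumably why the range_limit table fetched below
 * is never consulted.
 */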
static const short __attribute__ ((aligned(16))) jpeg_adjust[8] = {128, 128, 128, 128, 128, 128, 128, 128};

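/*
 * Row-pass coefficient tables.  Each 32-short table is laid out for
 * _mm_madd_epi16: the first 16 entries pair with the even-indexed
 * inputs (x0,x2 and x4,x6) and the last 16 with the odd-indexed inputs
 * (x1,x3 and x5,x7), so a single pmaddwd produces four 32-bit dot
 * products at once.
 */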
// Table for rows 0,4
static const short __attribute__ ((aligned(16))) M128_tab_i_04[32] = {
    16384,  21407,  16384,   8867,
    16384,  -8867,  16384, -21407,
    16384,   8867, -16384, -21407,
   -16384,  21407,  16384,  -8867,
    22725,  19266,  19266,  -4520,
    12873, -22725,   4520, -12873,
    12873,   4520, -22725, -12873,
     4520,  19266,  19266, -22725
};

// Table for rows 1,7
static const short __attribute__ ((aligned(16))) M128_tab_i_17[32] = {
    22725,  29692,  22725,  12299,
    22725, -12299,  22725, -29692,
    22725,  12299, -22725, -29692,
   -22725,  29692,  22725, -12299,
    31521,  26722,  26722,  -6270,
    17855, -31521,   6270, -17855,
    17855,   6270, -31521, -17855,
     6270,  26722,  26722, -31521
};

// Table for rows 2,6
static const short __attribute__ ((aligned(16))) M128_tab_i_26[32] = {
    21407,  27969,  21407,  11585,
    21407, -11585,  21407, -27969,
    21407,  11585, -21407, -27969,
   -21407,  27969,  21407, -11585,
    29692,  25172,  25172,  -5906,
    16819, -29692,   5906, -16819,
    16819,   5906, -29692, -16819,
     5906,  25172,  25172, -29692
};

// Table for rows 3,5
static const short __attribute__ ((aligned(16))) M128_tab_i_35[32] = {
    19266,  25172,  19266,  10426,
    19266, -10426,  19266, -25172,
    19266,  10426, -19266, -25172,
   -19266,  25172,  19266, -10426,
    26722,  22654,  22654,  -5315,
    15137, -26722,   5315, -15137,
    15137,   5315, -26722, -15137,
     5315,  22654,  22654, -26722
};


/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * using Intel SSE2 intrinsics.
 */

GLOBAL(void)
jpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                    JCOEFPTR coef_block,
                    JSAMPARRAY output_buf, JDIMENSION output_col)
{
  __m128i row0, tmp1, tmp2, tmp3, row2, tmp5, tmp6, tmp7;
  int ctr;
  JSAMPROW outptrTemp;
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  short __attribute__((aligned(16))) quantptrSSE[DCTSIZE2];
  short __attribute__((aligned(16))) workspaceSSE[DCTSIZE2];
  short __attribute__((aligned(16))) coef_blockSSE[DCTSIZE2];
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i *tg3, *tg1, *tg2, *cos4;
  __m128i tm765, tp765, tm465, tp465, tp03, tm03, tp12, tm12, tp65, tm65;
  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  __m128i temp, temp2;
  short * wsptr;
  unsigned char * outptr;
#define iDCT_8_2ROWs(table1, table2) \
  row0 = _mm_shufflelo_epi16(row0, 0xD8);  /* x7, x6, x5, x4, x3, x1, x2, x0 */ \
  row2 = _mm_shufflelo_epi16(row2, 0xD8); \
  tmp1 = _mm_shuffle_epi32(row0, 0);       /* x2, x0, x2, x0, x2, x0, x2, x0 */ \
  tmp5 = _mm_shuffle_epi32(row2, 0); \
\
  tmp3 = _mm_shuffle_epi32(row0, 0x55);    /* x3, x1, x3, x1, x3, x1, x3, x1 */ \
  tmp7 = _mm_shuffle_epi32(row2, 0x55); \
  row0 = _mm_shufflehi_epi16(row0, 0xD8);  /* x7, x5, x6, x4, x3, x1, x2, x0 */ \
  row2 = _mm_shufflehi_epi16(row2, 0xD8); \
\
  tmp1 = _mm_madd_epi16(tmp1, *(__m128i*)table1);  /* x2*w13+x0*w12, x2*w9+x0*w8, x2*w5+x0*w4, x2*w1+x0*w0 */ \
  tmp5 = _mm_madd_epi16(tmp5, *(__m128i*)table2); \
\
  tmp2 = _mm_shuffle_epi32(row0, 0xAA);    /* x6, x4, x6, x4, x6, x4, x6, x4 */ \
  tmp6 = _mm_shuffle_epi32(row2, 0xAA); \
  row0 = _mm_shuffle_epi32(row0, 0xFF);    /* x7, x5, x7, x5, x7, x5, x7, x5 */ \
  row2 = _mm_shuffle_epi32(row2, 0xFF); \
\
  tmp3 = _mm_madd_epi16(tmp3, *(__m128i*)(table1+16));  /* x3*w29+x1*w28, x3*w25+x1*w24, x3*w21+x1*w20, x3*w17+x1*w16 */ \
  tmp7 = _mm_madd_epi16(tmp7, *(__m128i*)(table2+16)); \
  row0 = _mm_madd_epi16(row0, *(__m128i*)(table1+24));  /* x7*w31+x5*w30, x7*w27+x5*w26, x7*w23+x5*w22, x7*w19+x5*w18 */ \
  row2 = _mm_madd_epi16(row2, *(__m128i*)(table2+24)); \
  tmp2 = _mm_madd_epi16(tmp2, *(__m128i*)(table1+8));   /* x6*w15+x4*w14, x6*w11+x4*w10, x6*w7+x4*w6, x6*w3+x4*w2 */ \
  tmp6 = _mm_madd_epi16(tmp6, *(__m128i*)(table2+8)); \
\
  tmp1 = _mm_add_epi32(tmp1, *(__m128i*)M128_round_inv_row); \
  tmp5 = _mm_add_epi32(tmp5, *(__m128i*)M128_round_inv_row); \
  row0 = _mm_add_epi32(row0, tmp3);  /* b3, b2, b1, b0 */ \
  row2 = _mm_add_epi32(row2, tmp7); \
  tmp1 = _mm_add_epi32(tmp1, tmp2);  /* a3, a2, a1, a0 */ \
  tmp5 = _mm_add_epi32(tmp5, tmp6); \
\
  tmp2 = tmp1; \
  tmp6 = tmp5; \
  tmp2 = _mm_sub_epi32(tmp2, row0);  /* for row0: y4 = a3-b3, y5 = a2-b2, y6 = a1-b1, y7 = a0-b0 */ \
  tmp6 = _mm_sub_epi32(tmp6, row2); \
  row0 = _mm_add_epi32(row0, tmp1);  /* y3 = a3+b3, y2 = a2+b2, y1 = a1+b1, y0 = a0+b0 */ \
  row2 = _mm_add_epi32(row2, tmp5); \
  tmp2 = _mm_srai_epi32(tmp2, SHIFT_INV_ROW); \
  tmp6 = _mm_srai_epi32(tmp6, SHIFT_INV_ROW); \
  row0 = _mm_srai_epi32(row0, SHIFT_INV_ROW); \
  row2 = _mm_srai_epi32(row2, SHIFT_INV_ROW); \
  tmp2 = _mm_shuffle_epi32(tmp2, 0x1B);  /* y7, y6, y5, y4 */ \
  tmp6 = _mm_shuffle_epi32(tmp6, 0x1B); \
  row0 = _mm_packs_epi32(row0, tmp2);  /* row0 = y7, y6, y5, y4, y3, y2, y1, y0 */ \
  row2 = _mm_packs_epi32(row2, tmp6);  /* row2 = y7, ..., y0 */

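/*
 * iDCT_8_COL -- column pass over all eight columns at once, in what
 * looks like the AP-922 butterfly: the odd part (t4..t7) is built from
 * rows 1, 3, 5 and 7 using the tangent and cos(pi/4) constants, the
 * even part (t0..t3) from rows 0, 2, 4 and 6, and each pair of output
 * rows is descaled by SHIFT_INV_COL, level-shifted by 128, and stored
 * as saturated bytes.
 */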
#define iDCT_8_COL() \
  x3 = _mm_load_si128((__m128i*)(wsptr+24)); \
  x1 = _mm_load_si128((__m128i*)(wsptr+8)); \
  x5 = row0; \
  x7 = row2; \
\
  tg3 = (__m128i*)(M128_tg_3_16); \
  tg1 = (__m128i*)(M128_tg_1_16); \
  tg2 = (__m128i*)(M128_tg_2_16); \
  cos4 = (__m128i*)(M128_cos_4_16); \
\
  temp = _mm_mulhi_epi16(x5, *tg3);   /* row5 * tg3 */ \
  temp2 = _mm_mulhi_epi16(x3, *tg3); \
  temp = _mm_adds_epi16(temp, x5);    /* coef adjustment */ \
  temp2 = _mm_adds_epi16(temp2, x3); \
  tm765 = _mm_adds_epi16(temp, x3); \
  tm465 = _mm_subs_epi16(x5, temp2); \
\
  temp = _mm_mulhi_epi16(x7, *tg1);   /* row7 * tg1 */ \
  temp2 = _mm_mulhi_epi16(x1, *tg1); \
  tp765 = _mm_adds_epi16(temp, x1); \
  tp465 = _mm_subs_epi16(temp2, x7);  /* row1 * tg1 - row7 */ \
\
  t7 = _mm_adds_epi16(tp765, tm765); \
  t7 = _mm_adds_epi16(t7, *(__m128i*)M128_one_corr); \
  tp65 = _mm_subs_epi16(tp765, tm765); \
  t4 = _mm_adds_epi16(tp465, tm465); \
  tm65 = _mm_subs_epi16(tp465, tm465); \
  tm65 = _mm_adds_epi16(tm65, *(__m128i*)M128_one_corr); \
\
  x0 = _mm_load_si128((__m128i*)(wsptr)); \
  x4 = _mm_load_si128((__m128i*)(wsptr+32)); \
  x2 = _mm_load_si128((__m128i*)(wsptr+16)); \
  x6 = _mm_load_si128((__m128i*)(wsptr+48)); \
\
  /* t6 = (tp65 + tm65) * cos_4_16 */ \
  temp = _mm_adds_epi16(tp65, tm65); \
  temp2 = _mm_subs_epi16(tp65, tm65); \
  t6 = _mm_mulhi_epi16(temp, *cos4); \
  t5 = _mm_mulhi_epi16(temp2, *cos4); \
  t6 = _mm_adds_epi16(t6, temp); \
  t6 = _mm_or_si128(t6, *(__m128i*)M128_one_corr); \
  t5 = _mm_adds_epi16(t5, temp2); \
  t5 = _mm_or_si128(t5, *(__m128i*)M128_one_corr); \
\
  tp03 = _mm_adds_epi16(x0, x4); \
  tp12 = _mm_subs_epi16(x0, x4); \
\
  temp = _mm_mulhi_epi16(x6, *tg2); \
  temp2 = _mm_mulhi_epi16(x2, *tg2); \
  tm03 = _mm_adds_epi16(temp, x2); \
  tm12 = _mm_subs_epi16(temp2, x6); \
\
  t0 = _mm_adds_epi16(tp03, tm03); \
  t0 = _mm_adds_epi16(t0, *(__m128i*)M128_round_inv_col); \
  t3 = _mm_subs_epi16(tp03, tm03); \
  t3 = _mm_adds_epi16(t3, *(__m128i*)M128_round_inv_corr); \
  t1 = _mm_adds_epi16(tp12, tm12); \
  t1 = _mm_adds_epi16(t1, *(__m128i*)M128_round_inv_col); \
  t2 = _mm_subs_epi16(tp12, tm12); \
  t2 = _mm_adds_epi16(t2, *(__m128i*)M128_round_inv_corr); \
\
  temp = _mm_adds_epi16(t0, t7);   /* y0 */ \
  temp2 = _mm_adds_epi16(t1, t6);  /* y1 */ \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust);  /* add 128 for JPEG decoding */ \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr), temp);  /* store y0, y1 */ \
\
  temp = _mm_adds_epi16(t2, t5); \
  temp2 = _mm_adds_epi16(t3, t4); \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust); \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr+16), temp);  /* store y2, y3 */ \
\
  temp = _mm_subs_epi16(t3, t4); \
  temp2 = _mm_subs_epi16(t2, t5); \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust); \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr+32), temp);  /* store y4, y5 */ \
\
  temp = _mm_subs_epi16(t1, t6); \
  temp2 = _mm_subs_epi16(t0, t7); \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust); \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr+48), temp);  /* store y6, y7 */


  /* Copy into 16-byte-aligned local buffers so the aligned SSE2 loads
     below are safe. */
  memcpy((char*)quantptrSSE, (char*)compptr->dct_table, sizeof(quantptrSSE));
  memcpy((char*)coef_blockSSE, (char*)coef_block, sizeof(coef_blockSSE));

  wsptr = (short *)workspaceSSE;
  outptr = (unsigned char *)workspaceSSE;
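  /*
   * outptr aliases workspaceSSE: the 8x8 byte result is written over the
   * 16-bit workspace.  This appears safe because every load in
   * iDCT_8_COL is issued before its first store to outptr.
   */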

  // row 0 and row 2
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*2));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)quantptrSSE);
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*2));

  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);

  _mm_store_si128((__m128i*)(wsptr), row0);
  _mm_store_si128((__m128i*)(wsptr+8*2), row2);

  // row 4 and row 6
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*4));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*6));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*4));
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*6));

  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);

  _mm_store_si128((__m128i*)(wsptr+8*4), row0);
  _mm_store_si128((__m128i*)(wsptr+8*6), row2);

  // row 3 and row 1
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*3));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*1));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*3));
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*1));

  iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17);

  _mm_store_si128((__m128i*)(wsptr+8*3), row0);
  _mm_store_si128((__m128i*)(wsptr+8*1), row2);

  // row 5 and row 7
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*5));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*7));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*5));
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*7));

  iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17);

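  /*
   * Rows 5 and 7 are not written back to the workspace; they stay in
   * row0/row2, which iDCT_8_COL picks up directly as x5 and x7.
   */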
  iDCT_8_COL();

  for (ctr = 0; ctr < DCTSIZE; ctr++)
  {
    outptrTemp = output_buf[ctr] + output_col;
    memcpy(outptrTemp, outptr, DCTSIZE);
    outptr += DCTSIZE;  /* advance pointer to next row */
  }

  return;
}
#endif /* ANDROID_INTELSSE2_IDCT */