/*
 * Copyright (C) 2010-2011 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| 16 | |
| 17 | #define JPEG_INTERNALS |
| 18 | #include "jinclude.h" |
| 19 | #include "jpeglib.h" |
| 20 | #include "jdct.h" /* Private declarations for DCT subsystem */ |
| 21 | |
| 22 | #ifdef ANDROID_INTELSSE2_IDCT |
| 23 | #include <emmintrin.h> |
| 24 | |
| 25 | #if DCTSIZE != 8 |
| 26 | Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ |
| 27 | #endif |
| 28 | |
/*
 * Fixed-point parameters.  The row pass produces 32-bit sums that are
 * shifted right by SHIFT_INV_ROW; the column pass works on 16-bit lanes
 * that are shifted right by SHIFT_INV_COL.
 */
#define BITS_INV_ACC   4
#define SHIFT_INV_ROW  12
#define SHIFT_INV_COL  5

/* Scalar rounding biases (same values as the vector forms below). */
const short RND_INV_ROW  = 1 << (SHIFT_INV_ROW - 1);        /* 2048 */
const short RND_INV_COL  = 1 << (SHIFT_INV_COL - 1);        /* 16   */
const short RND_INV_CORR = (1 << (SHIFT_INV_COL - 1)) - 1;  /* 15   */

/* All-ones correction term used by the column pass. */
static const short __attribute__ ((aligned(16))) M128_one_corr[8] =
  { 1, 1, 1, 1, 1, 1, 1, 1 };

/*
 * Row-pass rounding bias.  It is added with a 32-bit add, so each
 * (2048, 0) pair of shorts forms one 32-bit lane holding 2048.
 */
static const short __attribute__ ((aligned(16))) M128_round_inv_row[8] =
  { 2048, 0, 2048, 0, 2048, 0, 2048, 0 };

/* Column-pass rounding biases, one per 16-bit lane. */
static const short __attribute__ ((aligned(16))) M128_round_inv_col[8] =
  { 16, 16, 16, 16, 16, 16, 16, 16 };
static const short __attribute__ ((aligned(16))) M128_round_inv_corr[8] =
  { 15, 15, 15, 15, 15, 15, 15, 15 };

/*
 * Multipliers for _mm_mulhi_epi16 (which keeps the high 16 bits of the
 * product, i.e. a scale of 2^16).  The two negative entries are the
 * constant minus 65536; the column pass adds the operand back after the
 * multiply to compensate.
 */
static const short __attribute__ ((aligned(16))) M128_tg_1_16[8] =
  { 13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036 };
static const short __attribute__ ((aligned(16))) M128_tg_2_16[8] =
  { 27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146 };
static const short __attribute__ ((aligned(16))) M128_tg_3_16[8] =
  { -21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746 };
static const short __attribute__ ((aligned(16))) M128_cos_4_16[8] =
  { -19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195 };

/* Level shift added to every output sample before packing to bytes. */
static const short __attribute__ ((aligned(16))) jpeg_adjust[8] =
  { 128, 128, 128, 128, 128, 128, 128, 128 };
| 47 | |
/*
 * Row-pass coefficients for rows 0 and 4.
 * One 128-bit vector per line; iDCT_8_2ROWs multiplies each vector with
 * the broadcast input pair named on the right.
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_04[32] = {
  16384,  21407,  16384,   8867,  16384,  -8867,  16384, -21407,  /* (x0, x2) */
  16384,   8867, -16384, -21407, -16384,  21407,  16384,  -8867,  /* (x4, x6) */
  22725,  19266,  19266,  -4520,  12873, -22725,   4520, -12873,  /* (x1, x3) */
  12873,   4520, -22725, -12873,   4520,  19266,  19266, -22725   /* (x5, x7) */
};
| 59 | |
/*
 * Row-pass coefficients for rows 1 and 7.
 * One 128-bit vector per line; iDCT_8_2ROWs multiplies each vector with
 * the broadcast input pair named on the right.
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_17[32] = {
  22725,  29692,  22725,  12299,  22725, -12299,  22725, -29692,  /* (x0, x2) */
  22725,  12299, -22725, -29692, -22725,  29692,  22725, -12299,  /* (x4, x6) */
  31521,  26722,  26722,  -6270,  17855, -31521,   6270, -17855,  /* (x1, x3) */
  17855,   6270, -31521, -17855,   6270,  26722,  26722, -31521   /* (x5, x7) */
};
| 71 | |
/*
 * Row-pass coefficients for rows 2 and 6.
 * One 128-bit vector per line; iDCT_8_2ROWs multiplies each vector with
 * the broadcast input pair named on the right.
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_26[32] = {
  21407,  27969,  21407,  11585,  21407, -11585,  21407, -27969,  /* (x0, x2) */
  21407,  11585, -21407, -27969, -21407,  27969,  21407, -11585,  /* (x4, x6) */
  29692,  25172,  25172,  -5906,  16819, -29692,   5906, -16819,  /* (x1, x3) */
  16819,   5906, -29692, -16819,   5906,  25172,  25172, -29692   /* (x5, x7) */
};
| 83 | |
/*
 * Row-pass coefficients for rows 3 and 5.
 * One 128-bit vector per line; iDCT_8_2ROWs multiplies each vector with
 * the broadcast input pair named on the right.
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_35[32] = {
  19266,  25172,  19266,  10426,  19266, -10426,  19266, -25172,  /* (x0, x2) */
  19266,  10426, -19266, -25172, -19266,  25172,  19266, -10426,  /* (x4, x6) */
  26722,  22654,  22654,  -5315,  15137, -26722,   5315, -15137,  /* (x1, x3) */
  15137,   5315, -26722, -15137,   5315,  22654,  22654, -26722   /* (x5, x7) */
};
| 95 | |
| 96 | |
| 97 | /* |
| 98 | * Perform dequantization and inverse DCT on one block of coefficients by SSE. |
| 99 | */ |
| 100 | |
| 101 | GLOBAL(void) |
| 102 | jpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| 103 | JCOEFPTR coef_block, |
| 104 | JSAMPARRAY output_buf, JDIMENSION output_col) |
| 105 | { |
| 106 | __m128i row0, tmp1, tmp2, tmp3, row2, tmp5, tmp6, tmp7; |
| 107 | int ctr; |
| 108 | JSAMPROW outptrTemp; |
| 109 | JSAMPLE *range_limit = IDCT_range_limit(cinfo); |
| 110 | short __attribute__((aligned(16))) quantptrSSE[DCTSIZE2]; |
| 111 | short __attribute__((aligned(16))) workspaceSSE[DCTSIZE2]; |
| 112 | short __attribute__((aligned(16))) coef_blockSSE[DCTSIZE2]; |
| 113 | __m128i x0, x1, x2, x3, x4, x5, x6, x7; |
| 114 | __m128i* tg3, *tg1, *tg2, *cos4; |
| 115 | __m128i tm765, tp765, tm465, tp465, tp03, tm03, tp12, tm12, tp65, tm65; |
| 116 | __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
| 117 | __m128i temp, temp2; |
| 118 | short * wsptr; |
| 119 | unsigned char * outptr; |
| 120 | |
| 121 | #define iDCT_8_2ROWs(table1, table2) \ |
| 122 | row0 = _mm_shufflelo_epi16(row0, 0xD8); /*x7, x6, x5, x4, x3, x1, x2, x0*/ \ |
| 123 | row2 = _mm_shufflelo_epi16(row2, 0xD8); \ |
| 124 | tmp1 = _mm_shuffle_epi32(row0, 0); /*x2, x0, x2, x0, x2, x0, x2, x0*/ \ |
| 125 | tmp5 = _mm_shuffle_epi32(row2, 0); \ |
| 126 | \ |
| 127 | tmp3 = _mm_shuffle_epi32(row0, 0x55); /*x3, x1, x3, x1, x3, x1, x3, x1*/ \ |
| 128 | tmp7 = _mm_shuffle_epi32(row2, 0x55); \ |
| 129 | row0 = _mm_shufflehi_epi16(row0, 0xD8); /*x7, x5, x6, x4, x3, x1, x2, x0*/ \ |
| 130 | row2 = _mm_shufflehi_epi16(row2, 0xD8); \ |
| 131 | \ |
| 132 | tmp1 = _mm_madd_epi16(tmp1, * ( __m128i*)table1); /*x2*w13+x0*w12, x2*w9+x0*w8, x2*w5+x0*w4, x2*w1+x0*w0*/ \ |
| 133 | tmp5 = _mm_madd_epi16(tmp5, * ( __m128i*)table2); \ |
| 134 | \ |
| 135 | tmp2 = _mm_shuffle_epi32(row0, 0xAA); /*x6, x4, x6, x4, x6, x4, x6, x4*/ \ |
| 136 | tmp6 = _mm_shuffle_epi32(row2, 0xAA); \ |
| 137 | row0 = _mm_shuffle_epi32(row0, 0xFF); /*x7, x5, x7, x5, x7, x5, x7, x5*/ \ |
| 138 | row2 = _mm_shuffle_epi32(row2, 0xFF); \ |
| 139 | \ |
| 140 | tmp3 = _mm_madd_epi16(tmp3, * ( __m128i*)(table1+16)); /*x3*w29+x1*w28, x3*w25+x1*w24, x3*w21+x1*w20, x3*w17+x1*w16*/ \ |
| 141 | tmp7 = _mm_madd_epi16(tmp7, * ( __m128i*)(table2+16) ); \ |
| 142 | row0 = _mm_madd_epi16(row0, * ( __m128i*)(table1+24)); /*x7*w31+x5*w30, x7*w27+x5*w26, x7*w23+x5*w22, x7*w19+x5*w18*/ \ |
| 143 | row2 = _mm_madd_epi16(row2, * ( __m128i*)(table2+24) ); \ |
| 144 | tmp2 = _mm_madd_epi16(tmp2, * ( __m128i*)(table1+8) ); /*x6*w15+x4*w14, x6*w11+x4*w10, x6*w7+x4*w6, x6*w3+x4*w2*/ \ |
| 145 | tmp6 = _mm_madd_epi16(tmp6, * ( __m128i*)(table2+8) ); \ |
| 146 | \ |
| 147 | tmp1 = _mm_add_epi32(tmp1, * ( __m128i*)M128_round_inv_row); \ |
| 148 | tmp5 = _mm_add_epi32(tmp5, * ( __m128i*)M128_round_inv_row); \ |
| 149 | row0 = _mm_add_epi32(row0, tmp3); /*b3, b2, b1, b0*/ \ |
| 150 | row2 = _mm_add_epi32(row2, tmp7); \ |
| 151 | tmp1 = _mm_add_epi32(tmp1, tmp2); /*a3, a2, a1, a0*/ \ |
| 152 | tmp5 = _mm_add_epi32(tmp5, tmp6); \ |
| 153 | \ |
| 154 | tmp2 = tmp1; \ |
| 155 | tmp6 = tmp5; \ |
| 156 | tmp2 = _mm_sub_epi32(tmp2, row0); /*for row0. y4= a3-b3, y5=a2-b2, y6=a1-b1, y7=a0-b0 */ \ |
| 157 | tmp6 = _mm_sub_epi32(tmp6, row2); \ |
| 158 | row0 = _mm_add_epi32(row0, tmp1); /*y3=a3+b3,y2=a2+b2,y1=a1+b1,y0=a0+b0*/ \ |
| 159 | row2 = _mm_add_epi32(row2, tmp5); \ |
| 160 | tmp2 = _mm_srai_epi32(tmp2, SHIFT_INV_ROW); \ |
| 161 | tmp6 = _mm_srai_epi32(tmp6, SHIFT_INV_ROW); \ |
| 162 | row0 = _mm_srai_epi32(row0, SHIFT_INV_ROW); \ |
| 163 | row2 = _mm_srai_epi32(row2, SHIFT_INV_ROW); \ |
| 164 | tmp2 = _mm_shuffle_epi32(tmp2, 0x1B); /*y7, y6, y5, y4*/ \ |
| 165 | tmp6 = _mm_shuffle_epi32(tmp6, 0x1B); \ |
| 166 | row0 = _mm_packs_epi32(row0, tmp2); /*row0 = y7,y6,y5,y4,y3,y2,y1,y0*/ \ |
| 167 | row2 = _mm_packs_epi32(row2, tmp6); /*row2 = y7,...y0*/ |
| 168 | |
| 169 | |
| 170 | #define iDCT_8_COL() \ |
| 171 | x3 = _mm_load_si128(( __m128i*)(wsptr+24));\ |
| 172 | x1 = _mm_load_si128(( __m128i*)(wsptr+8));\ |
| 173 | x5 = row0;\ |
| 174 | x7 = row2;\ |
| 175 | \ |
| 176 | tg3 = ( __m128i*)(M128_tg_3_16);\ |
| 177 | tg1 = ( __m128i*)(M128_tg_1_16);\ |
| 178 | tg2 = ( __m128i*)(M128_tg_2_16);\ |
| 179 | cos4 =(__m128i*)(M128_cos_4_16);\ |
| 180 | \ |
| 181 | temp = _mm_mulhi_epi16(x5, *tg3); /*row5*tg3*/ \ |
| 182 | temp2 = _mm_mulhi_epi16(x3, *tg3);\ |
| 183 | temp = _mm_adds_epi16(temp, x5); /*coef adjustment*/ \ |
| 184 | temp2 = _mm_adds_epi16(temp2, x3);\ |
| 185 | tm765 = _mm_adds_epi16(temp, x3);\ |
| 186 | tm465 = _mm_subs_epi16(x5, temp2);\ |
| 187 | \ |
| 188 | temp = _mm_mulhi_epi16(x7, *tg1); /*row7*tg1*/ \ |
| 189 | temp2 = _mm_mulhi_epi16(x1, *tg1);\ |
| 190 | tp765 = _mm_adds_epi16(temp, x1);\ |
| 191 | tp465 = _mm_subs_epi16(temp2, x7); /*row1*tg1 - row7*/ \ |
| 192 | \ |
| 193 | t7 = _mm_adds_epi16(tp765, tm765);\ |
| 194 | t7 = _mm_adds_epi16(t7, *( __m128i*)M128_one_corr);\ |
| 195 | tp65 = _mm_subs_epi16(tp765, tm765);\ |
| 196 | t4 = _mm_adds_epi16(tp465, tm465);\ |
| 197 | tm65 = _mm_subs_epi16(tp465, tm465);\ |
| 198 | tm65 = _mm_adds_epi16(tm65, *( __m128i*)M128_one_corr);\ |
| 199 | \ |
| 200 | x0 = _mm_load_si128(( __m128i*)(wsptr));\ |
| 201 | x4 = _mm_load_si128(( __m128i*)(wsptr+32));\ |
| 202 | x2 = _mm_load_si128(( __m128i*)(wsptr+16));\ |
| 203 | x6 = _mm_load_si128(( __m128i*)(wsptr+48));\ |
| 204 | \ |
| 205 | /*t6 = ( tp65 + tm65 ) * cos_4_16;*/ \ |
| 206 | temp = _mm_adds_epi16(tp65, tm65);\ |
| 207 | temp2 = _mm_subs_epi16(tp65, tm65);\ |
| 208 | t6 = _mm_mulhi_epi16(temp, *cos4);\ |
| 209 | t5 = _mm_mulhi_epi16(temp2, *cos4);\ |
| 210 | t6 = _mm_adds_epi16(t6, temp);\ |
| 211 | t6 = _mm_or_si128(t6, *( __m128i*)M128_one_corr);\ |
| 212 | t5 = _mm_adds_epi16(t5, temp2);\ |
| 213 | t5 = _mm_or_si128(t5, *( __m128i*)M128_one_corr);\ |
| 214 | \ |
| 215 | tp03 = _mm_adds_epi16(x0, x4);\ |
| 216 | tp12 = _mm_subs_epi16(x0, x4);\ |
| 217 | \ |
| 218 | temp = _mm_mulhi_epi16(x6, *tg2);\ |
| 219 | temp2 = _mm_mulhi_epi16(x2, *tg2);\ |
| 220 | tm03 = _mm_adds_epi16(temp, x2);\ |
| 221 | tm12 = _mm_subs_epi16(temp2, x6);\ |
| 222 | \ |
| 223 | t0 = _mm_adds_epi16(tp03, tm03);\ |
| 224 | t0 = _mm_adds_epi16(t0, *( __m128i*)M128_round_inv_col);\ |
| 225 | t3 = _mm_subs_epi16(tp03, tm03);\ |
| 226 | t3 = _mm_adds_epi16(t3, *( __m128i*)M128_round_inv_corr);\ |
| 227 | t1 = _mm_adds_epi16(tp12, tm12);\ |
| 228 | t1 = _mm_adds_epi16(t1, *( __m128i*)M128_round_inv_col);\ |
| 229 | t2 = _mm_subs_epi16(tp12, tm12);\ |
| 230 | t2 = _mm_adds_epi16(t2, *( __m128i*)M128_round_inv_corr);\ |
| 231 | \ |
| 232 | temp = _mm_adds_epi16(t0, t7); /*y0*/ \ |
| 233 | temp2 = _mm_adds_epi16(t1, t6); /*y1*/ \ |
| 234 | temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ |
| 235 | temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ |
| 236 | temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust); /*Add 128 for jpeg decoding*/ \ |
| 237 | temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ |
| 238 | \ |
| 239 | temp = _mm_packus_epi16(temp, temp2);\ |
| 240 | _mm_store_si128(( __m128i*)(outptr), temp); /*store y0, y1*/ \ |
| 241 | \ |
| 242 | temp = _mm_adds_epi16(t2, t5);\ |
| 243 | temp2 = _mm_adds_epi16(t3, t4);\ |
| 244 | temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ |
| 245 | temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ |
| 246 | temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\ |
| 247 | temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ |
| 248 | \ |
| 249 | temp = _mm_packus_epi16(temp, temp2);\ |
| 250 | _mm_store_si128(( __m128i*)(outptr+16), temp); /*store y2, y3*/ \ |
| 251 | \ |
| 252 | temp = _mm_subs_epi16(t3, t4);\ |
| 253 | temp2 = _mm_subs_epi16(t2, t5);\ |
| 254 | temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ |
| 255 | temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ |
| 256 | temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\ |
| 257 | temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ |
| 258 | \ |
| 259 | temp = _mm_packus_epi16(temp, temp2);\ |
| 260 | _mm_store_si128(( __m128i*)(outptr+32), temp); /*store y4, y5*/ \ |
| 261 | \ |
| 262 | temp = _mm_subs_epi16(t1, t6);\ |
| 263 | temp2 = _mm_subs_epi16(t0, t7);\ |
| 264 | temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ |
| 265 | temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ |
| 266 | temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\ |
| 267 | temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ |
| 268 | \ |
| 269 | temp = _mm_packus_epi16(temp, temp2);\ |
| 270 | _mm_store_si128(( __m128i*)(outptr+48), temp); /*store y6, y7*/ |
| 271 | |
| 272 | |
| 273 | /*Memcpy to do 16byte alignment. */ |
| 274 | memcpy((char*)quantptrSSE, (char*)compptr->dct_table, sizeof(quantptrSSE)); |
| 275 | memcpy((char*)coef_blockSSE, (char*)coef_block, sizeof(coef_blockSSE)); |
| 276 | |
| 277 | wsptr = (short *)workspaceSSE; |
| 278 | outptr = (unsigned char*)workspaceSSE; |
| 279 | |
| 280 | // row 0 and row 2 |
| 281 | row0 = _mm_load_si128((__m128i const*)(coef_blockSSE)); |
| 282 | row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*2)); |
| 283 | row0 = _mm_mullo_epi16( row0, *(__m128i const*)quantptrSSE ); |
| 284 | row2 = _mm_mullo_epi16( row2, *(__m128i const*)(quantptrSSE+8*2) ); |
| 285 | |
| 286 | iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26); |
| 287 | |
| 288 | _mm_store_si128((__m128i*)(wsptr), row0); |
| 289 | _mm_store_si128((__m128i*)(wsptr+8*2), row2); |
| 290 | |
| 291 | // row 4 and row 6 |
| 292 | row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*4)); |
| 293 | row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*6)); |
| 294 | row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*4) ); |
| 295 | row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*6) ); |
| 296 | |
| 297 | iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26); |
| 298 | |
| 299 | _mm_store_si128((__m128i*)(wsptr+32), row0); |
| 300 | _mm_store_si128((__m128i*)(wsptr+48), row2); |
| 301 | |
| 302 | // row 3 and row 1 |
| 303 | row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*3)); |
| 304 | row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*1)); |
| 305 | row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+24) ); |
| 306 | row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8) ); |
| 307 | |
| 308 | iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17); |
| 309 | |
| 310 | _mm_store_si128((__m128i*)(wsptr+24), row0); |
| 311 | _mm_store_si128((__m128i*)(wsptr+8), row2); |
| 312 | |
| 313 | // row 5 and row 7 |
| 314 | row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*5)); |
| 315 | row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*7)); |
| 316 | row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+40) ); |
| 317 | row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+56)); |
| 318 | |
| 319 | iDCT_8_2ROWs( M128_tab_i_35, M128_tab_i_17); |
| 320 | |
| 321 | iDCT_8_COL(); |
| 322 | |
| 323 | for(ctr = 0; ctr < DCTSIZE; ctr++) |
| 324 | { |
| 325 | outptrTemp = output_buf[ctr] + output_col; |
| 326 | memcpy(outptrTemp, outptr, DCTSIZE); |
| 327 | outptr += DCTSIZE; /* advance pointer to next row */ |
| 328 | } |
| 329 | |
| 330 | return; |
| 331 | } |
| 332 | #endif /* ANDROID_INTELSSE2_IDCT */ |