/*
 * Copyright (C) 2010-2011 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */

#ifdef ANDROID_INTELSSE2_IDCT
#include <emmintrin.h>

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif

#define BITS_INV_ACC 4
#define SHIFT_INV_ROW 12
#define SHIFT_INV_COL 5
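
/*
 * These shift values look like the usual AP-922 derivation from
 * BITS_INV_ACC: SHIFT_INV_ROW = 16 - BITS_INV_ACC = 12 and
 * SHIFT_INV_COL = BITS_INV_ACC + 1 = 5.  The rounders below are simply
 * 1 << (shift - 1): 2048 for the row pass, 16 for the column pass, and
 * 15 (= 16 - 1) as the corrected rounder applied to the subtractive
 * column outputs.  Only the vector copies further down are actually
 * referenced by the code.
 */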
const short RND_INV_ROW = 2048;
const short RND_INV_COL = 16;
const short RND_INV_CORR = 15;

static const short __attribute__ ((aligned(16))) M128_one_corr[8] = {1, 1, 1, 1, 1, 1, 1, 1};
static const short __attribute__ ((aligned(16))) M128_round_inv_row[8] = {2048, 0, 2048, 0, 2048, 0, 2048, 0};
static const short __attribute__ ((aligned(16))) M128_round_inv_col[8] = {16, 16, 16, 16, 16, 16, 16, 16};
static const short __attribute__ ((aligned(16))) M128_round_inv_corr[8] = {15, 15, 15, 15, 15, 15, 15, 15};

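/*
 * The tangent/cosine constants below appear to be Q16 fixed point:
 * 13036 ~= tan(pi/16) * 65536 and 27146 ~= tan(2*pi/16) * 65536.
 * tan(3*pi/16) * 65536 (~43790) and cos(pi/4) * 65536 (~46341) do not
 * fit in a signed short, so they are stored wrapped modulo 2^16
 * (-21746 and -19195); the column pass compensates by adding the
 * original operand back after each _mm_mulhi_epi16 (the "coef
 * adjustment" steps), since mulhi(x, c - 65536) == mulhi(x, c) - x.
 */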
static const short __attribute__ ((aligned(16))) M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};
static const short __attribute__ ((aligned(16))) M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};
static const short __attribute__ ((aligned(16))) M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746};
static const short __attribute__ ((aligned(16))) M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};

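/*
 * JPEG samples are level-shifted by CENTERJSAMPLE (128) before the
 * forward DCT, so 128 is added back after the inverse transform.
 * Together with the saturating _mm_packus_epi16 stores (which clamp to
 * 0..255), this is presumably why the range_limit table fetched below
 * is never consulted.
 */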
static const short __attribute__ ((aligned(16))) jpeg_adjust[8] = {128, 128, 128, 128, 128, 128, 128, 128};

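/*
 * Row-pass coefficient tables.  Each 32-short table is laid out for
 * _mm_madd_epi16: the first 16 entries pair with the even-indexed
 * inputs (x0,x2 and x4,x6) and the last 16 with the odd-indexed inputs
 * (x1,x3 and x5,x7), so a single pmaddwd produces four 32-bit dot
 * products at once.
 */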
// Table for rows 0,4
static const short __attribute__ ((aligned(16))) M128_tab_i_04[32] = {
    16384,  21407,  16384,   8867,
    16384,  -8867,  16384, -21407,
    16384,   8867, -16384, -21407,
   -16384,  21407,  16384,  -8867,
    22725,  19266,  19266,  -4520,
    12873, -22725,   4520, -12873,
    12873,   4520, -22725, -12873,
     4520,  19266,  19266, -22725
};

// Table for rows 1,7
static const short __attribute__ ((aligned(16))) M128_tab_i_17[32] = {
    22725,  29692,  22725,  12299,
    22725, -12299,  22725, -29692,
    22725,  12299, -22725, -29692,
   -22725,  29692,  22725, -12299,
    31521,  26722,  26722,  -6270,
    17855, -31521,   6270, -17855,
    17855,   6270, -31521, -17855,
     6270,  26722,  26722, -31521
};

// Table for rows 2,6
static const short __attribute__ ((aligned(16))) M128_tab_i_26[32] = {
    21407,  27969,  21407,  11585,
    21407, -11585,  21407, -27969,
    21407,  11585, -21407, -27969,
   -21407,  27969,  21407, -11585,
    29692,  25172,  25172,  -5906,
    16819, -29692,   5906, -16819,
    16819,   5906, -29692, -16819,
     5906,  25172,  25172, -29692
};

// Table for rows 3,5
static const short __attribute__ ((aligned(16))) M128_tab_i_35[32] = {
    19266,  25172,  19266,  10426,
    19266, -10426,  19266, -25172,
    19266,  10426, -19266, -25172,
   -19266,  25172,  19266, -10426,
    26722,  22654,  22654,  -5315,
    15137, -26722,   5315, -15137,
    15137,   5315, -26722, -15137,
     5315,  22654,  22654, -26722
};


/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * using Intel SSE2 intrinsics.
 */

GLOBAL(void)
jpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                    JCOEFPTR coef_block,
                    JSAMPARRAY output_buf, JDIMENSION output_col)
{
  __m128i row0, tmp1, tmp2, tmp3, row2, tmp5, tmp6, tmp7;
  int ctr;
  JSAMPROW outptrTemp;
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  short __attribute__((aligned(16))) quantptrSSE[DCTSIZE2];
  short __attribute__((aligned(16))) workspaceSSE[DCTSIZE2];
  short __attribute__((aligned(16))) coef_blockSSE[DCTSIZE2];
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i *tg3, *tg1, *tg2, *cos4;
  __m128i tm765, tp765, tm465, tp465, tp03, tm03, tp12, tm12, tp65, tm65;
  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  __m128i temp, temp2;
  short * wsptr;
  unsigned char * outptr;
#define iDCT_8_2ROWs(table1, table2) \
  row0 = _mm_shufflelo_epi16(row0, 0xD8);  /* x7, x6, x5, x4, x3, x1, x2, x0 */ \
  row2 = _mm_shufflelo_epi16(row2, 0xD8); \
  tmp1 = _mm_shuffle_epi32(row0, 0);       /* x2, x0, x2, x0, x2, x0, x2, x0 */ \
  tmp5 = _mm_shuffle_epi32(row2, 0); \
\
  tmp3 = _mm_shuffle_epi32(row0, 0x55);    /* x3, x1, x3, x1, x3, x1, x3, x1 */ \
  tmp7 = _mm_shuffle_epi32(row2, 0x55); \
  row0 = _mm_shufflehi_epi16(row0, 0xD8);  /* x7, x5, x6, x4, x3, x1, x2, x0 */ \
  row2 = _mm_shufflehi_epi16(row2, 0xD8); \
\
  tmp1 = _mm_madd_epi16(tmp1, *(__m128i*)table1);  /* x2*w13+x0*w12, x2*w9+x0*w8, x2*w5+x0*w4, x2*w1+x0*w0 */ \
  tmp5 = _mm_madd_epi16(tmp5, *(__m128i*)table2); \
\
  tmp2 = _mm_shuffle_epi32(row0, 0xAA);    /* x6, x4, x6, x4, x6, x4, x6, x4 */ \
  tmp6 = _mm_shuffle_epi32(row2, 0xAA); \
  row0 = _mm_shuffle_epi32(row0, 0xFF);    /* x7, x5, x7, x5, x7, x5, x7, x5 */ \
  row2 = _mm_shuffle_epi32(row2, 0xFF); \
\
  tmp3 = _mm_madd_epi16(tmp3, *(__m128i*)(table1+16));  /* x3*w29+x1*w28, x3*w25+x1*w24, x3*w21+x1*w20, x3*w17+x1*w16 */ \
  tmp7 = _mm_madd_epi16(tmp7, *(__m128i*)(table2+16)); \
  row0 = _mm_madd_epi16(row0, *(__m128i*)(table1+24));  /* x7*w31+x5*w30, x7*w27+x5*w26, x7*w23+x5*w22, x7*w19+x5*w18 */ \
  row2 = _mm_madd_epi16(row2, *(__m128i*)(table2+24)); \
  tmp2 = _mm_madd_epi16(tmp2, *(__m128i*)(table1+8));   /* x6*w15+x4*w14, x6*w11+x4*w10, x6*w7+x4*w6, x6*w3+x4*w2 */ \
  tmp6 = _mm_madd_epi16(tmp6, *(__m128i*)(table2+8)); \
\
  tmp1 = _mm_add_epi32(tmp1, *(__m128i*)M128_round_inv_row); \
  tmp5 = _mm_add_epi32(tmp5, *(__m128i*)M128_round_inv_row); \
  row0 = _mm_add_epi32(row0, tmp3);  /* b3, b2, b1, b0 */ \
  row2 = _mm_add_epi32(row2, tmp7); \
  tmp1 = _mm_add_epi32(tmp1, tmp2);  /* a3, a2, a1, a0 */ \
  tmp5 = _mm_add_epi32(tmp5, tmp6); \
\
  tmp2 = tmp1; \
  tmp6 = tmp5; \
  tmp2 = _mm_sub_epi32(tmp2, row0);  /* for row0: y4 = a3-b3, y5 = a2-b2, y6 = a1-b1, y7 = a0-b0 */ \
  tmp6 = _mm_sub_epi32(tmp6, row2); \
  row0 = _mm_add_epi32(row0, tmp1);  /* y3 = a3+b3, y2 = a2+b2, y1 = a1+b1, y0 = a0+b0 */ \
  row2 = _mm_add_epi32(row2, tmp5); \
  tmp2 = _mm_srai_epi32(tmp2, SHIFT_INV_ROW); \
  tmp6 = _mm_srai_epi32(tmp6, SHIFT_INV_ROW); \
  row0 = _mm_srai_epi32(row0, SHIFT_INV_ROW); \
  row2 = _mm_srai_epi32(row2, SHIFT_INV_ROW); \
  tmp2 = _mm_shuffle_epi32(tmp2, 0x1B);  /* y7, y6, y5, y4 */ \
  tmp6 = _mm_shuffle_epi32(tmp6, 0x1B); \
  row0 = _mm_packs_epi32(row0, tmp2);  /* row0 = y7, y6, y5, y4, y3, y2, y1, y0 */ \
  row2 = _mm_packs_epi32(row2, tmp6);  /* row2 = y7, ..., y0 */

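/*
 * iDCT_8_COL -- column pass over all eight columns at once, in what
 * looks like the AP-922 butterfly: the odd part (t4..t7) is built from
 * rows 1, 3, 5 and 7 using the tangent and cos(pi/4) constants, the
 * even part (t0..t3) from rows 0, 2, 4 and 6, and each pair of output
 * rows is descaled by SHIFT_INV_COL, level-shifted by 128, and stored
 * as saturated bytes.
 */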
#define iDCT_8_COL() \
  x3 = _mm_load_si128((__m128i*)(wsptr+24)); \
  x1 = _mm_load_si128((__m128i*)(wsptr+8)); \
  x5 = row0; \
  x7 = row2; \
\
  tg3 = (__m128i*)(M128_tg_3_16); \
  tg1 = (__m128i*)(M128_tg_1_16); \
  tg2 = (__m128i*)(M128_tg_2_16); \
  cos4 = (__m128i*)(M128_cos_4_16); \
\
  temp = _mm_mulhi_epi16(x5, *tg3);   /* row5 * tg3 */ \
  temp2 = _mm_mulhi_epi16(x3, *tg3); \
  temp = _mm_adds_epi16(temp, x5);    /* coef adjustment */ \
  temp2 = _mm_adds_epi16(temp2, x3); \
  tm765 = _mm_adds_epi16(temp, x3); \
  tm465 = _mm_subs_epi16(x5, temp2); \
\
  temp = _mm_mulhi_epi16(x7, *tg1);   /* row7 * tg1 */ \
  temp2 = _mm_mulhi_epi16(x1, *tg1); \
  tp765 = _mm_adds_epi16(temp, x1); \
  tp465 = _mm_subs_epi16(temp2, x7);  /* row1 * tg1 - row7 */ \
\
  t7 = _mm_adds_epi16(tp765, tm765); \
  t7 = _mm_adds_epi16(t7, *(__m128i*)M128_one_corr); \
  tp65 = _mm_subs_epi16(tp765, tm765); \
  t4 = _mm_adds_epi16(tp465, tm465); \
  tm65 = _mm_subs_epi16(tp465, tm465); \
  tm65 = _mm_adds_epi16(tm65, *(__m128i*)M128_one_corr); \
\
  x0 = _mm_load_si128((__m128i*)(wsptr)); \
  x4 = _mm_load_si128((__m128i*)(wsptr+32)); \
  x2 = _mm_load_si128((__m128i*)(wsptr+16)); \
  x6 = _mm_load_si128((__m128i*)(wsptr+48)); \
\
  /* t6 = (tp65 + tm65) * cos_4_16 */ \
  temp = _mm_adds_epi16(tp65, tm65); \
  temp2 = _mm_subs_epi16(tp65, tm65); \
  t6 = _mm_mulhi_epi16(temp, *cos4); \
  t5 = _mm_mulhi_epi16(temp2, *cos4); \
  t6 = _mm_adds_epi16(t6, temp); \
  t6 = _mm_or_si128(t6, *(__m128i*)M128_one_corr); \
  t5 = _mm_adds_epi16(t5, temp2); \
  t5 = _mm_or_si128(t5, *(__m128i*)M128_one_corr); \
\
  tp03 = _mm_adds_epi16(x0, x4); \
  tp12 = _mm_subs_epi16(x0, x4); \
\
  temp = _mm_mulhi_epi16(x6, *tg2); \
  temp2 = _mm_mulhi_epi16(x2, *tg2); \
  tm03 = _mm_adds_epi16(temp, x2); \
  tm12 = _mm_subs_epi16(temp2, x6); \
\
  t0 = _mm_adds_epi16(tp03, tm03); \
  t0 = _mm_adds_epi16(t0, *(__m128i*)M128_round_inv_col); \
  t3 = _mm_subs_epi16(tp03, tm03); \
  t3 = _mm_adds_epi16(t3, *(__m128i*)M128_round_inv_corr); \
  t1 = _mm_adds_epi16(tp12, tm12); \
  t1 = _mm_adds_epi16(t1, *(__m128i*)M128_round_inv_col); \
  t2 = _mm_subs_epi16(tp12, tm12); \
  t2 = _mm_adds_epi16(t2, *(__m128i*)M128_round_inv_corr); \
\
  temp = _mm_adds_epi16(t0, t7);   /* y0 */ \
  temp2 = _mm_adds_epi16(t1, t6);  /* y1 */ \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust);  /* add 128 for JPEG decoding */ \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr), temp);  /* store y0, y1 */ \
\
  temp = _mm_adds_epi16(t2, t5); \
  temp2 = _mm_adds_epi16(t3, t4); \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust); \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr+16), temp);  /* store y2, y3 */ \
\
  temp = _mm_subs_epi16(t3, t4); \
  temp2 = _mm_subs_epi16(t2, t5); \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust); \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr+32), temp);  /* store y4, y5 */ \
\
  temp = _mm_subs_epi16(t1, t6); \
  temp2 = _mm_subs_epi16(t0, t7); \
  temp = _mm_srai_epi16(temp, SHIFT_INV_COL); \
  temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL); \
  temp = _mm_adds_epi16(temp, *(__m128i*)jpeg_adjust); \
  temp2 = _mm_adds_epi16(temp2, *(__m128i*)jpeg_adjust); \
\
  temp = _mm_packus_epi16(temp, temp2); \
  _mm_store_si128((__m128i*)(outptr+48), temp);  /* store y6, y7 */


  /* Copy into 16-byte-aligned local buffers so the aligned SSE2 loads
     below are safe. */
  memcpy((char*)quantptrSSE, (char*)compptr->dct_table, sizeof(quantptrSSE));
  memcpy((char*)coef_blockSSE, (char*)coef_block, sizeof(coef_blockSSE));

  wsptr = (short *)workspaceSSE;
  outptr = (unsigned char *)workspaceSSE;
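  /*
   * outptr aliases workspaceSSE: the 8x8 byte result is written over the
   * 16-bit workspace.  This appears safe because every load in
   * iDCT_8_COL is issued before its first store to outptr.
   */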

  // row 0 and row 2
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*2));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)quantptrSSE);
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*2));

  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);

  _mm_store_si128((__m128i*)(wsptr), row0);
  _mm_store_si128((__m128i*)(wsptr+8*2), row2);

  // row 4 and row 6
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*4));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*6));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*4));
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*6));

  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);

  _mm_store_si128((__m128i*)(wsptr+8*4), row0);
  _mm_store_si128((__m128i*)(wsptr+8*6), row2);

  // row 3 and row 1
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*3));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*1));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*3));
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*1));

  iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17);

  _mm_store_si128((__m128i*)(wsptr+8*3), row0);
  _mm_store_si128((__m128i*)(wsptr+8*1), row2);

  // row 5 and row 7
  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*5));
  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*7));
  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*5));
  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*7));

  iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17);

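  /*
   * Rows 5 and 7 are not written back to the workspace; they stay in
   * row0/row2, which iDCT_8_COL picks up directly as x5 and x7.
   */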
  iDCT_8_COL();

  for (ctr = 0; ctr < DCTSIZE; ctr++)
  {
    outptrTemp = output_buf[ctr] + output_col;
    memcpy(outptrTemp, outptr, DCTSIZE);
    outptr += DCTSIZE;  /* advance pointer to next row */
  }

  return;
}
#endif /* ANDROID_INTELSSE2_IDCT */