/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
| 22 | |
/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */
| 31 | |
#include "jsimd_altivec.h"
| 33 | |
| 34 | |
| 35 | #define F_1_082 277 /* FIX(1.082392200) */ |
| 36 | #define F_1_414 362 /* FIX(1.414213562) */ |
| 37 | #define F_1_847 473 /* FIX(1.847759065) */ |
| 38 | #define F_2_613 669 /* FIX(2.613125930) */ |
| 39 | #define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ |
| 40 | |
| 41 | #define CONST_BITS 8 |
| 42 | #define PASS1_BITS 2 |
| 43 | #define PRE_MULTIPLY_SCALE_BITS 2 |
| 44 | #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) |
| 45 | |
| 46 | |
| 47 | #define DO_IDCT(in) \ |
| 48 | { \ |
| 49 | /* Even part */ \ |
| 50 | \ |
| 51 | tmp10 = vec_add(in##0, in##4); \ |
| 52 | tmp11 = vec_sub(in##0, in##4); \ |
| 53 | tmp13 = vec_add(in##2, in##6); \ |
| 54 | \ |
| 55 | tmp12 = vec_sub(in##2, in##6); \ |
| 56 | tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ |
| 57 | tmp12 = vec_madds(tmp12, pw_F1414, zero); \ |
| 58 | tmp12 = vec_sub(tmp12, tmp13); \ |
| 59 | \ |
| 60 | tmp0 = vec_add(tmp10, tmp13); \ |
| 61 | tmp3 = vec_sub(tmp10, tmp13); \ |
| 62 | tmp1 = vec_add(tmp11, tmp12); \ |
| 63 | tmp2 = vec_sub(tmp11, tmp12); \ |
| 64 | \ |
| 65 | /* Odd part */ \ |
| 66 | \ |
| 67 | z13 = vec_add(in##5, in##3); \ |
| 68 | z10 = vec_sub(in##5, in##3); \ |
| 69 | z10s = vec_sl(z10, pre_multiply_scale_bits); \ |
| 70 | z11 = vec_add(in##1, in##7); \ |
| 71 | z12s = vec_sub(in##1, in##7); \ |
| 72 | z12s = vec_sl(z12s, pre_multiply_scale_bits); \ |
| 73 | \ |
| 74 | tmp11 = vec_sub(z11, z13); \ |
| 75 | tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ |
| 76 | tmp11 = vec_madds(tmp11, pw_F1414, zero); \ |
| 77 | \ |
| 78 | tmp7 = vec_add(z11, z13); \ |
| 79 | \ |
DRC | ff30c63 | 2014-12-23 02:42:59 +0000 | [diff] [blame^] | 80 | /* To avoid overflow... \ |
| 81 | * \ |
| 82 | * (Original) \ |
| 83 | * tmp12 = -2.613125930 * z10 + z5; \ |
| 84 | * \ |
| 85 | * (This implementation) \ |
| 86 | * tmp12 = (-1.613125930 - 1) * z10 + z5; \ |
| 87 | * = -1.613125930 * z10 - z10 + z5; \ |
| 88 | */ \ |
| 89 | \ |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 90 | z5 = vec_add(z10s, z12s); \ |
| 91 | z5 = vec_madds(z5, pw_F1847, zero); \ |
| 92 | \ |
| 93 | tmp10 = vec_madds(z12s, pw_F1082, zero); \ |
| 94 | tmp10 = vec_sub(tmp10, z5); \ |
| 95 | tmp12 = vec_madds(z10s, pw_MF1613, z5); \ |
| 96 | tmp12 = vec_sub(tmp12, z10); \ |
| 97 | \ |
| 98 | tmp6 = vec_sub(tmp12, tmp7); \ |
| 99 | tmp5 = vec_sub(tmp11, tmp6); \ |
| 100 | tmp4 = vec_add(tmp10, tmp5); \ |
| 101 | \ |
| 102 | out0 = vec_add(tmp0, tmp7); \ |
| 103 | out1 = vec_add(tmp1, tmp6); \ |
| 104 | out2 = vec_add(tmp2, tmp5); \ |
| 105 | out3 = vec_sub(tmp3, tmp4); \ |
| 106 | out4 = vec_add(tmp3, tmp4); \ |
| 107 | out5 = vec_sub(tmp2, tmp5); \ |
| 108 | out6 = vec_sub(tmp1, tmp6); \ |
| 109 | out7 = vec_sub(tmp0, tmp7); \ |
| 110 | } |
| 111 | |
| 112 | |
| 113 | void |
| 114 | jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block, |
| 115 | JSAMPARRAY output_buf, JDIMENSION output_col) |
| 116 | { |
| 117 | short *dct_table = (short *)dct_table_; |
| 118 | __vector short row0, row1, row2, row3, row4, row5, row6, row7, |
| 119 | col0, col1, col2, col3, col4, col5, col6, col7, |
| 120 | quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, |
| 121 | tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, |
| 122 | z5, z10, z10s, z11, z12s, z13, |
| 123 | out0, out1, out2, out3, out4, out5, out6, out7; |
| 124 | __vector signed char outb; |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 125 | int *outptr; |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 126 | |
| 127 | /* Constants */ |
| 128 | __vector short zero = { __8X(0) }, |
| 129 | pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, |
| 130 | pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, |
| 131 | pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, |
| 132 | pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; |
| 133 | __vector unsigned short |
| 134 | pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, |
| 135 | pass1_bits3 = { __8X(PASS1_BITS + 3) }; |
| 136 | __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; |
| 137 | |
DRC | 13af139 | 2014-12-22 01:38:01 +0000 | [diff] [blame] | 138 | /* Pass 1: process columns */ |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 139 | |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 140 | col0 = vec_ld(0, coef_block); |
| 141 | col1 = vec_ld(16, coef_block); |
| 142 | col2 = vec_ld(32, coef_block); |
| 143 | col3 = vec_ld(48, coef_block); |
| 144 | col4 = vec_ld(64, coef_block); |
| 145 | col5 = vec_ld(80, coef_block); |
| 146 | col6 = vec_ld(96, coef_block); |
| 147 | col7 = vec_ld(112, coef_block); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 148 | |
| 149 | tmp1 = vec_or(col1, col2); |
| 150 | tmp2 = vec_or(col3, col4); |
| 151 | tmp1 = vec_or(tmp1, tmp2); |
| 152 | tmp3 = vec_or(col5, col6); |
| 153 | tmp3 = vec_or(tmp3, col7); |
| 154 | tmp1 = vec_or(tmp1, tmp3); |
| 155 | |
| 156 | quant0 = *(__vector short *)&dct_table[0]; |
| 157 | col0 = vec_mladd(col0, quant0, zero); |
| 158 | |
| 159 | if (vec_all_eq(tmp1, zero)) { |
| 160 | /* AC terms all zero */ |
| 161 | |
| 162 | row0 = vec_splat(col0, 0); |
| 163 | row1 = vec_splat(col0, 1); |
| 164 | row2 = vec_splat(col0, 2); |
| 165 | row3 = vec_splat(col0, 3); |
| 166 | row4 = vec_splat(col0, 4); |
| 167 | row5 = vec_splat(col0, 5); |
| 168 | row6 = vec_splat(col0, 6); |
| 169 | row7 = vec_splat(col0, 7); |
| 170 | |
| 171 | } else { |
| 172 | |
| 173 | quant1 = *(__vector short *)&dct_table[8]; |
| 174 | quant2 = *(__vector short *)&dct_table[16]; |
| 175 | quant3 = *(__vector short *)&dct_table[24]; |
| 176 | quant4 = *(__vector short *)&dct_table[32]; |
| 177 | quant5 = *(__vector short *)&dct_table[40]; |
| 178 | quant6 = *(__vector short *)&dct_table[48]; |
| 179 | quant7 = *(__vector short *)&dct_table[56]; |
| 180 | |
| 181 | col1 = vec_mladd(col1, quant1, zero); |
| 182 | col2 = vec_mladd(col2, quant2, zero); |
| 183 | col3 = vec_mladd(col3, quant3, zero); |
| 184 | col4 = vec_mladd(col4, quant4, zero); |
| 185 | col5 = vec_mladd(col5, quant5, zero); |
| 186 | col6 = vec_mladd(col6, quant6, zero); |
| 187 | col7 = vec_mladd(col7, quant7, zero); |
| 188 | |
| 189 | DO_IDCT(col); |
| 190 | |
| 191 | TRANSPOSE(out, row); |
| 192 | } |
| 193 | |
DRC | 13af139 | 2014-12-22 01:38:01 +0000 | [diff] [blame] | 194 | /* Pass 2: process rows */ |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 195 | |
| 196 | DO_IDCT(row); |
| 197 | |
| 198 | out0 = vec_sra(out0, pass1_bits3); |
| 199 | out1 = vec_sra(out1, pass1_bits3); |
| 200 | out2 = vec_sra(out2, pass1_bits3); |
| 201 | out3 = vec_sra(out3, pass1_bits3); |
| 202 | out4 = vec_sra(out4, pass1_bits3); |
| 203 | out5 = vec_sra(out5, pass1_bits3); |
| 204 | out6 = vec_sra(out6, pass1_bits3); |
| 205 | out7 = vec_sra(out7, pass1_bits3); |
| 206 | |
| 207 | TRANSPOSE(out, col); |
| 208 | |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 209 | outb = vec_packs(col0, col0); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 210 | outb = vec_add(outb, pb_centerjsamp); |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 211 | outptr = (int *)(output_buf[0] + output_col); |
| 212 | vec_ste((__vector int)outb, 0, outptr); |
| 213 | vec_ste((__vector int)outb, 4, outptr); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 214 | |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 215 | outb = vec_packs(col1, col1); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 216 | outb = vec_add(outb, pb_centerjsamp); |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 217 | outptr = (int *)(output_buf[1] + output_col); |
| 218 | vec_ste((__vector int)outb, 0, outptr); |
| 219 | vec_ste((__vector int)outb, 4, outptr); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 220 | |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 221 | outb = vec_packs(col2, col2); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 222 | outb = vec_add(outb, pb_centerjsamp); |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 223 | outptr = (int *)(output_buf[2] + output_col); |
| 224 | vec_ste((__vector int)outb, 0, outptr); |
| 225 | vec_ste((__vector int)outb, 4, outptr); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 226 | |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 227 | outb = vec_packs(col3, col3); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 228 | outb = vec_add(outb, pb_centerjsamp); |
DRC | 4545308 | 2014-12-22 16:04:17 +0000 | [diff] [blame] | 229 | outptr = (int *)(output_buf[3] + output_col); |
| 230 | vec_ste((__vector int)outb, 0, outptr); |
| 231 | vec_ste((__vector int)outb, 4, outptr); |
| 232 | |
| 233 | outb = vec_packs(col4, col4); |
| 234 | outb = vec_add(outb, pb_centerjsamp); |
| 235 | outptr = (int *)(output_buf[4] + output_col); |
| 236 | vec_ste((__vector int)outb, 0, outptr); |
| 237 | vec_ste((__vector int)outb, 4, outptr); |
| 238 | |
| 239 | outb = vec_packs(col5, col5); |
| 240 | outb = vec_add(outb, pb_centerjsamp); |
| 241 | outptr = (int *)(output_buf[5] + output_col); |
| 242 | vec_ste((__vector int)outb, 0, outptr); |
| 243 | vec_ste((__vector int)outb, 4, outptr); |
| 244 | |
| 245 | outb = vec_packs(col6, col6); |
| 246 | outb = vec_add(outb, pb_centerjsamp); |
| 247 | outptr = (int *)(output_buf[6] + output_col); |
| 248 | vec_ste((__vector int)outb, 0, outptr); |
| 249 | vec_ste((__vector int)outb, 4, outptr); |
| 250 | |
| 251 | outb = vec_packs(col7, col7); |
| 252 | outb = vec_add(outb, pb_centerjsamp); |
| 253 | outptr = (int *)(output_buf[7] + output_col); |
| 254 | vec_ste((__vector int)outb, 0, outptr); |
| 255 | vec_ste((__vector int)outb, 4, outptr); |
DRC | 535674b | 2014-12-22 01:00:42 +0000 | [diff] [blame] | 256 | } |