Blame - src/opts/SkBitmapProcState_opts_SSSE3.cpp - platform/external/skia

blob: 63e59439ac60816e3cb1610e6fd975f128c2ea75 [file] [log] [blame]

tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame^]	1	/*
				2	* Copyright 2012 The Android Open Source Project
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
				6	*/
				7
				8	#include <tmmintrin.h> // SSSE3
				9	#include "SkBitmapProcState_opts_SSSE3.h"
				10	#include "SkUtils.h"
				11
				12	// adding anonymous namespace seemed to force gcc to inline directly the
				13	// instantiation, instead of creating the functions
				14	// S32_generic_D32_filter_DX_SSSE3<true> and
				15	// S32_generic_D32_filter_DX_SSSE3<false> which were then called by the
				16	// external functions.
				17	namespace {
				18	// In this file, variations for alpha and non alpha versions are implemented
				19	// with a template, as it makes the code more compact and a bit easier to
				20	// maintain, while making the compiler generate the same exact code as with
				21	// two functions that only differ by a few lines.
				22
				23
				24	// Prepare all necessary constants for a round of processing for two pixel
				25	// pairs.
				26	// @param xy is the location where the xy parameters for four pixels should be
				27	// read from. It is identical in concept with argument two of
				28	// S32_{opaque}_D32_filter_DX methods.
				29	// @param mask_3FFF vector of 32 bit constants containing 3FFF,
				30	// suitable to mask the bottom 14 bits of a XY value.
				31	// @param mask_000F vector of 32 bit constants containing 000F,
				32	// suitable to mask the bottom 4 bits of a XY value.
				33	// @param sixteen_8bit vector of 8 bit components containing the value 16.
				34	// @param mask_dist_select vector of 8 bit components containing the shuffling
				35	// parameters to reorder x[0-3] parameters.
				36	// @param all_x_result vector of 8 bit components that will contain the
				37	// (4x(x3), 4x(x2), 4x(x1), 4x(x0)) upon return.
				38	// @param sixteen_minus_x vector of 8 bit components, containing
				39	// (4x(16 - x3), 4x(16 - x2), 4x(16 - x1), 4x(16 - x0))
				40	inline void PrepareConstantsTwoPixelPairs(const uint32_t* xy,
				41	__m128i mask_3FFF,
				42	__m128i mask_000F,
				43	__m128i sixteen_8bit,
				44	__m128i mask_dist_select,
				45	__m128i* all_x_result,
				46	__m128i* sixteen_minus_x,
				47	int* x0,
				48	int* x1) {
				49	const __m128i xx = _mm_loadu_si128(reinterpret_cast<const __m128i *>(xy));
				50
				51	// 4 delta X
				52	// (x03, x02, x01, x00)
				53	const __m128i x0_wide = _mm_srli_epi32(xx, 18);
				54	// (x13, x12, x11, x10)
				55	const __m128i x1_wide = _mm_and_si128(xx, mask_3FFF);
				56
				57	_mm_storeu_si128(reinterpret_cast<__m128i *>(x0), x0_wide);
				58	_mm_storeu_si128(reinterpret_cast<__m128i *>(x1), x1_wide);
				59
				60	__m128i all_x = _mm_and_si128(_mm_srli_epi32(xx, 14), mask_000F);
				61
				62	// (4x(x3), 4x(x2), 4x(x1), 4x(x0))
				63	all_x = _mm_shuffle_epi8(all_x, mask_dist_select);
				64
				65	*all_x_result = all_x;
				66	// (4x(16-x3), 4x(16-x2), 4x(16-x1), 4x(16-x0))
				67	*sixteen_minus_x = _mm_sub_epi8(sixteen_8bit, all_x);
				68	}
				69
				70	// Helper function used when processing one pixel pair.
				71	// @param pixel0..3 are the four input pixels
				72	// @param scale_x vector of 8 bit components to multiply the pixel[0:3]. This
				73	// will contain (4x(x1, 16-x1), 4x(x0, 16-x0))
				74	// or (4x(x3, 16-x3), 4x(x2, 16-x2))
				75	// @return a vector of 16 bit components containing:
				76	// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
				77	inline __m128i ProcessPixelPairHelper(uint32_t pixel0,
				78	uint32_t pixel1,
				79	uint32_t pixel2,
				80	uint32_t pixel3,
				81	__m128i scale_x) {
				82	__m128i a0, a1, a2, a3;
				83	// Load 2 pairs of pixels
				84	a0 = _mm_cvtsi32_si128(pixel0);
				85	a1 = _mm_cvtsi32_si128(pixel1);
				86
				87	// Interleave pixels.
				88	// (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
				89	a0 = _mm_unpacklo_epi8(a0, a1);
				90
				91	a2 = _mm_cvtsi32_si128(pixel2);
				92	a3 = _mm_cvtsi32_si128(pixel3);
				93	// (0, 0, 0, 0, 0, 0, 0, 0, Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2)
				94	a2 = _mm_unpacklo_epi8(a2, a3);
				95
				96	// two pairs of pixel pairs, interleaved.
				97	// (Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2,
				98	// Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
				99	a0 = _mm_unpacklo_epi64(a0, a2);
				100
				101	// multiply and sum to 16 bit components.
				102	// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
				103	// At that point, we use up a bit less than 12 bits for each 16 bit
				104	// component:
				105	// All components are less than 255. So,
				106	// C0 * (16 - x) + C1 * x <= 255 * (16 - x) + 255 * x = 255 * 16.
				107	return _mm_maddubs_epi16(a0, scale_x);
				108	}
				109
				110	// Scale back the results after multiplications to the [0:255] range, and scale
				111	// by alpha when has_alpha is true.
				112	// Depending on whether one set or two sets of multiplications had been applied,
				113	// the results have to be shifted by four places (dividing by 16), or shifted
				114	// by eight places (dividing by 256), since each multiplication is by a quantity
				115	// in the range [0:16].
				116	template<bool has_alpha, int scale>
				117	inline __m128i ScaleFourPixels(__m128i pixels,
				118	__m128i alpha) {
				119	// Divide each 16 bit component by 16 (or 256 depending on scale).
				120	pixels = _mm_srli_epi16(pixels, scale);
				121
				122	if (has_alpha) {
				123	// Multiply by alpha.
				124	pixels = _mm_mullo_epi16(pixels, alpha);
				125
				126	// Divide each 16 bit component by 256.
				127	pixels = _mm_srli_epi16(pixels, 8);
				128	}
				129	return pixels;
				130	}
				131
				132	// Wrapper to calculate two output pixels from four input pixels. The
				133	// arguments are the same as ProcessPixelPairHelper. Technically, there are
				134	// eight input pixels, but since sub_y == 0, the factors applied to half of the
				135	// pixels is zero (sub_y), and are therefore omitted here to save on some
				136	// processing.
				137	// @param alpha when has_alpha is true, scale all resulting components by this
				138	// value.
				139	// @return a vector of 16 bit components containing:
				140	// ((Aa2 * (16 - x1) + Aa3 * x1) * alpha, ...,
				141	// (Ra0 * (16 - x0) + Ra1 * x0) * alpha) (when has_alpha is true)
				142	// otherwise
				143	// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
				144	// In both cases, the results are renormalized (divided by 16) to match the
				145	// expected formats when storing back the results into memory.
				146	template<bool has_alpha>
				147	inline __m128i ProcessPixelPairZeroSubY(uint32_t pixel0,
				148	uint32_t pixel1,
				149	uint32_t pixel2,
				150	uint32_t pixel3,
				151	__m128i scale_x,
				152	__m128i alpha) {
				153	__m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
				154	scale_x);
				155	return ScaleFourPixels<has_alpha, 4>(sum, alpha);
				156	}
				157
				158	// Same as ProcessPixelPairZeroSubY, expect processing one output pixel at a
				159	// time instead of two. As in the above function, only two pixels are needed
				160	// to generate a single pixel since sub_y == 0.
				161	// @return same as ProcessPixelPairZeroSubY, except that only the bottom 4
				162	// 16 bit components are set.
				163	template<bool has_alpha>
				164	inline __m128i ProcessOnePixelZeroSubY(uint32_t pixel0,
				165	uint32_t pixel1,
				166	__m128i scale_x,
				167	__m128i alpha) {
				168	__m128i a0 = _mm_cvtsi32_si128(pixel0);
				169	__m128i a1 = _mm_cvtsi32_si128(pixel1);
				170
				171	// Interleave
				172	a0 = _mm_unpacklo_epi8(a0, a1);
				173
				174	// (a0 * (16-x) + a1 * x)
				175	__m128i sum = _mm_maddubs_epi16(a0, scale_x);
				176
				177	return ScaleFourPixels<has_alpha, 4>(sum, alpha);
				178	}
				179
				180	// Methods when sub_y != 0
				181
				182
				183	// Same as ProcessPixelPairHelper, except that the values are scaled by y.
				184	// @param y vector of 16 bit components containing 'y' values. There are two
				185	// cases in practice, where y will contain the sub_y constant, or will
				186	// contain the 16 - sub_y constant.
				187	// @return vector of 16 bit components containing:
				188	// (y * (Aa2 * (16 - x1) + Aa3 * x1), ... , y * (Ra0 * (16 - x0) + Ra1 * x0))
				189	inline __m128i ProcessPixelPair(uint32_t pixel0,
				190	uint32_t pixel1,
				191	uint32_t pixel2,
				192	uint32_t pixel3,
				193	__m128i scale_x,
				194	__m128i y) {
				195	__m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
				196	scale_x);
				197
				198	// first row times 16-y or y depending on whether 'y' represents one or
				199	// the other.
				200	// Values will be up to 255 * 16 * 16 = 65280.
				201	// (y * (Aa2 * (16 - x1) + Aa3 * x1), ... ,
				202	// y * (Ra0 * (16 - x0) + Ra1 * x0))
				203	sum = _mm_mullo_epi16(sum, y);
				204
				205	return sum;
				206	}
				207
				208	// Process two pixel pairs out of eight input pixels.
				209	// In other methods, the distinct pixels are passed one by one, but in this
				210	// case, the rows, and index offsets to the pixels into the row are passed
				211	// to generate the 8 pixels.
				212	// @param row0..1 top and bottom row where to find input pixels.
				213	// @param x0..1 offsets into the row for all eight input pixels.
				214	// @param all_y vector of 16 bit components containing the constant sub_y
				215	// @param neg_y vector of 16 bit components containing the constant 16 - sub_y
				216	// @param alpha vector of 16 bit components containing the alpha value to scale
				217	// the results by, when has_alpha is true.
				218	// @return
				219	// (alpha * ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) +
				220	// y * (Aa2' * (16-x1) + Aa3' * x1)),
				221	// ...
				222	// alpha * ((16-y) * (Ra0 * (16-x0) + Ra1 * x0) +
				223	// y * (Ra0' * (16-x0) + Ra1' * x0))
				224	// With the factor alpha removed when has_alpha is false.
				225	// The values are scaled back to 16 bit components, but with only the bottom
				226	// 8 bits being set.
				227	template<bool has_alpha>
				228	inline __m128i ProcessTwoPixelPairs(const uint32_t* row0,
				229	const uint32_t* row1,
				230	const int* x0,
				231	const int* x1,
				232	__m128i scale_x,
				233	__m128i all_y,
				234	__m128i neg_y,
				235	__m128i alpha) {
				236	__m128i sum0 = ProcessPixelPair(
				237	row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
				238	scale_x, neg_y);
				239	__m128i sum1 = ProcessPixelPair(
				240	row1[x0[0]], row1[x1[0]], row1[x0[1]], row1[x1[1]],
				241	scale_x, all_y);
				242
				243	// 2 samples fully summed.
				244	// ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) +
				245	// y * (Aa2' * (16-x1) + Aa3' * x1),
				246	// ...
				247	// (16-y) * (Ra0 * (16 - x0) + Ra1 * x0)) +
				248	// y * (Ra0' * (16-x0) + Ra1' * x0))
				249	// Each component, again can be at most 256 * 255 = 65280, so no overflow.
				250	sum0 = _mm_add_epi16(sum0, sum1);
				251
				252	return ScaleFourPixels<has_alpha, 8>(sum0, alpha);
				253	}
				254
				255
				256	// Same as ProcessPixelPair, except that performing the math one output pixel
				257	// at a time. This means that only the bottom four 16 bit components are set.
				258	inline __m128i ProcessOnePixel(uint32_t pixel0, uint32_t pixel1,
				259	__m128i scale_x, __m128i y) {
				260	__m128i a0 = _mm_cvtsi32_si128(pixel0);
				261	__m128i a1 = _mm_cvtsi32_si128(pixel1);
				262
				263	// Interleave
				264	// (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
				265	a0 = _mm_unpacklo_epi8(a0, a1);
				266
				267	// (a0 * (16-x) + a1 * x)
				268	a0 = _mm_maddubs_epi16(a0, scale_x);
				269
				270	// scale row by y
				271	return _mm_mullo_epi16(a0, y);
				272	}
				273
				274	// Notes about the various tricks that are used in this implementation:
				275	// - specialization for sub_y == 0.
				276	// Statistically, 1/16th of the samples will have sub_y == 0. When this
				277	// happens, the math goes from:
				278	// (16 - x)(16 - y)a00 + x(16 - y)a01 + (16 - x)ya10 + xya11
				279	// to:
				280	// (16 - x)a00 + 16x*a01
				281	// much simpler. The simplification makes for an easy boost in performance.
				282	// - calculating 4 output pixels at a time.
				283	// This allows loading the coefficients x0 and x1 and shuffling them to the
				284	// optimum location only once per loop, instead of twice per loop.
				285	// This also allows us to store the four pixels with a single store.
				286	// - Use of 2 special SSSE3 instructions (comparatively to the SSE2 instruction
				287	// version):
				288	// _mm_shuffle_epi8 : this allows us to spread the coefficients x[0-3] loaded
				289	// in 32 bit values to 8 bit values repeated four times.
				290	// _mm_maddubs_epi16 : this allows us to perform multiplications and additions
				291	// in one swoop of 8bit values storing the results in 16 bit values. This
				292	// instruction is actually crucial for the speed of the implementation since
				293	// as one can see in the SSE2 implementation, all inputs have to be used as
				294	// 16 bits because the results are 16 bits. This basically allows us to process
				295	// twice as many pixel components per iteration.
				296	//
				297	// As a result, this method behaves faster than the traditional SSE2. The actual
				298	// boost varies greatly on the underlying architecture.
				299	template<bool has_alpha>
				300	void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
				301	const uint32_t* xy,
				302	int count, uint32_t* colors) {
				303	SkASSERT(count > 0 && colors != NULL);
				304	SkASSERT(s.fDoFilter);
				305	SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
				306	if (has_alpha) {
				307	SkASSERT(s.fAlphaScale < 256);
				308	} else {
				309	SkASSERT(s.fAlphaScale == 256);
				310	}
				311
				312	const uint8_t* src_addr =
				313	static_cast<const uint8_t*>(s.fBitmap->getPixels());
				314	const unsigned rb = s.fBitmap->rowBytes();
				315	const uint32_t XY = *xy++;
				316	const unsigned y0 = XY >> 14;
				317	const uint32_t* row0 =
				318	reinterpret_cast<const uint32_t>(src_addr + (y0 >> 4) rb);
				319	const uint32_t* row1 =
				320	reinterpret_cast<const uint32_t>(src_addr + (XY & 0x3FFF) rb);
				321	const unsigned sub_y = y0 & 0xF;
				322
				323	// vector constants
				324	const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12,
				325	8, 8, 8, 8,
				326	4, 4, 4, 4,
				327	0, 0, 0, 0);
				328	const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF);
				329	const __m128i mask_000F = _mm_set1_epi32(0x000F);
				330	const __m128i sixteen_8bit = _mm_set1_epi8(16);
				331	// (0, 0, 0, 0, 0, 0, 0, 0)
				332	const __m128i zero = _mm_setzero_si128();
				333
				334	__m128i alpha;
				335	if (has_alpha)
				336	// 8x(alpha)
				337	alpha = _mm_set1_epi16(s.fAlphaScale);
				338
				339	if (sub_y == 0) {
				340	// Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
				341	while (count > 3) {
				342	count -= 4;
				343
				344	int x0[4];
				345	int x1[4];
				346	__m128i all_x, sixteen_minus_x;
				347	PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
				348	sixteen_8bit, mask_dist_select,
				349	&all_x, &sixteen_minus_x, x0, x1);
				350	xy += 4;
				351
				352	// First pair of pixel pairs.
				353	// (4x(x1, 16-x1), 4x(x0, 16-x0))
				354	__m128i scale_x;
				355	scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);
				356
				357	__m128i sum0 = ProcessPixelPairZeroSubY<has_alpha>(
				358	row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
				359	scale_x, alpha);
				360
				361	// second pair of pixel pairs
				362	// (4x (x3, 16-x3), 4x (16-x2, x2))
				363	scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);
				364
				365	__m128i sum1 = ProcessPixelPairZeroSubY<has_alpha>(
				366	row0[x0[2]], row0[x1[2]], row0[x0[3]], row0[x1[3]],
				367	scale_x, alpha);
				368
				369	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				370	sum0 = _mm_packus_epi16(sum0, sum1);
				371
				372	// Extract low int and store.
				373	_mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);
				374
				375	colors += 4;
				376	}
				377
				378	// handle remainder
				379	while (count-- > 0) {
				380	uint32_t xx = *xy++; // x0:14 \| 4 \| x1:14
				381	unsigned x0 = xx >> 18;
				382	unsigned x1 = xx & 0x3FFF;
				383
				384	// 16x(x)
				385	const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);
				386
				387	// (16x(16-x))
				388	__m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
				389
				390	scale_x = _mm_unpacklo_epi8(scale_x, all_x);
				391
				392	__m128i sum = ProcessOnePixelZeroSubY<has_alpha>(
				393	row0[x0], row0[x1],
				394	scale_x, alpha);
				395
				396	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				397	sum = _mm_packus_epi16(sum, zero);
				398
				399	// Extract low int and store.
				400	*colors++ = _mm_cvtsi128_si32(sum);
				401	}
				402	} else { // more general case, y != 0
				403	// 8x(16)
				404	const __m128i sixteen_16bit = _mm_set1_epi16(16);
				405
				406	// 8x (y)
				407	const __m128i all_y = _mm_set1_epi16(sub_y);
				408
				409	// 8x (16-y)
				410	const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y);
				411
				412	// Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
				413	while (count > 3) {
				414	count -= 4;
				415
				416	int x0[4];
				417	int x1[4];
				418	__m128i all_x, sixteen_minus_x;
				419	PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
				420	sixteen_8bit, mask_dist_select,
				421	&all_x, &sixteen_minus_x, x0, x1);
				422	xy += 4;
				423
				424	// First pair of pixel pairs
				425	// (4x(x1, 16-x1), 4x(x0, 16-x0))
				426	__m128i scale_x;
				427	scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);
				428
				429	__m128i sum0 = ProcessTwoPixelPairs<has_alpha>(
				430	row0, row1, x0, x1,
				431	scale_x, all_y, neg_y, alpha);
				432
				433	// second pair of pixel pairs
				434	// (4x (x3, 16-x3), 4x (16-x2, x2))
				435	scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);
				436
				437	__m128i sum1 = ProcessTwoPixelPairs<has_alpha>(
				438	row0, row1, x0 + 2, x1 + 2,
				439	scale_x, all_y, neg_y, alpha);
				440
				441	// Do the final packing of the two results
				442
				443	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				444	sum0 = _mm_packus_epi16(sum0, sum1);
				445
				446	// Extract low int and store.
				447	_mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);
				448
				449	colors += 4;
				450	}
				451
				452	// Left over.
				453	while (count-- > 0) {
				454	const uint32_t xx = *xy++; // x0:14 \| 4 \| x1:14
				455	const unsigned x0 = xx >> 18;
				456	const unsigned x1 = xx & 0x3FFF;
				457
				458	// 16x(x)
				459	const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);
				460
				461	// 16x (16-x)
				462	__m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
				463
				464	// (8x (x, 16-x))
				465	scale_x = _mm_unpacklo_epi8(scale_x, all_x);
				466
				467	// first row.
				468	__m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y);
				469	// second row.
				470	__m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y);
				471
				472	// Add both rows for full sample
				473	sum0 = _mm_add_epi16(sum0, sum1);
				474
				475	sum0 = ScaleFourPixels<has_alpha, 8>(sum0, alpha);
				476
				477	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				478	sum0 = _mm_packus_epi16(sum0, zero);
				479
				480	// Extract low int and store.
				481	*colors++ = _mm_cvtsi128_si32(sum0);
				482	}
				483	}
				484	}
				485	} // namespace
				486
				487	void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
				488	const uint32_t* xy,
				489	int count, uint32_t* colors) {
				490	S32_generic_D32_filter_DX_SSSE3<false>(s, xy, count, colors);
				491	}
				492
				493	void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
				494	const uint32_t* xy,
				495	int count, uint32_t* colors) {
				496	S32_generic_D32_filter_DX_SSSE3<true>(s, xy, count, colors);
				497	}