Blame - src/opts/SkBitmapProcState_opts_SSSE3.cpp - platform/external/skia

blob: 1246b953f463122e49ada87ad74a1b16d2a53e7d [file] [log] [blame]

tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	1	/*
				2	* Copyright 2012 The Android Open Source Project
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
				6	*/
				7
				8	#include <tmmintrin.h> // SSSE3
				9	#include "SkBitmapProcState_opts_SSSE3.h"
				10	#include "SkUtils.h"
				11
				12	// adding anonymous namespace seemed to force gcc to inline directly the
				13	// instantiation, instead of creating the functions
				14	// S32_generic_D32_filter_DX_SSSE3<true> and
				15	// S32_generic_D32_filter_DX_SSSE3<false> which were then called by the
				16	// external functions.
				17	namespace {
				18	// In this file, variations for alpha and non alpha versions are implemented
				19	// with a template, as it makes the code more compact and a bit easier to
				20	// maintain, while making the compiler generate the same exact code as with
				21	// two functions that only differ by a few lines.
				22
				23
				24	// Prepare all necessary constants for a round of processing for two pixel
				25	// pairs.
				26	// @param xy is the location where the xy parameters for four pixels should be
				27	// read from. It is identical in concept with argument two of
				28	// S32_{opaque}_D32_filter_DX methods.
				29	// @param mask_3FFF vector of 32 bit constants containing 3FFF,
				30	// suitable to mask the bottom 14 bits of a XY value.
				31	// @param mask_000F vector of 32 bit constants containing 000F,
				32	// suitable to mask the bottom 4 bits of a XY value.
				33	// @param sixteen_8bit vector of 8 bit components containing the value 16.
				34	// @param mask_dist_select vector of 8 bit components containing the shuffling
				35	// parameters to reorder x[0-3] parameters.
				36	// @param all_x_result vector of 8 bit components that will contain the
				37	// (4x(x3), 4x(x2), 4x(x1), 4x(x0)) upon return.
				38	// @param sixteen_minus_x vector of 8 bit components, containing
				39	// (4x(16 - x3), 4x(16 - x2), 4x(16 - x1), 4x(16 - x0))
				40	inline void PrepareConstantsTwoPixelPairs(const uint32_t* xy,
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	41	const __m128i& mask_3FFF,
				42	const __m128i& mask_000F,
				43	const __m128i& sixteen_8bit,
				44	const __m128i& mask_dist_select,
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	45	__m128i* all_x_result,
				46	__m128i* sixteen_minus_x,
				47	int* x0,
				48	int* x1) {
				49	const __m128i xx = _mm_loadu_si128(reinterpret_cast<const __m128i *>(xy));
				50
				51	// 4 delta X
				52	// (x03, x02, x01, x00)
				53	const __m128i x0_wide = _mm_srli_epi32(xx, 18);
				54	// (x13, x12, x11, x10)
				55	const __m128i x1_wide = _mm_and_si128(xx, mask_3FFF);
				56
				57	_mm_storeu_si128(reinterpret_cast<__m128i *>(x0), x0_wide);
				58	_mm_storeu_si128(reinterpret_cast<__m128i *>(x1), x1_wide);
				59
				60	__m128i all_x = _mm_and_si128(_mm_srli_epi32(xx, 14), mask_000F);
				61
				62	// (4x(x3), 4x(x2), 4x(x1), 4x(x0))
				63	all_x = _mm_shuffle_epi8(all_x, mask_dist_select);
				64
				65	*all_x_result = all_x;
				66	// (4x(16-x3), 4x(16-x2), 4x(16-x1), 4x(16-x0))
				67	*sixteen_minus_x = _mm_sub_epi8(sixteen_8bit, all_x);
				68	}
				69
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	70	// Prepare all necessary constants for a round of processing for two pixel
				71	// pairs.
				72	// @param xy is the location where the xy parameters for four pixels should be
				73	// read from. It is identical in concept with argument two of
				74	// S32_{opaque}_D32_filter_DXDY methods.
				75	// @param mask_3FFF vector of 32 bit constants containing 3FFF,
				76	// suitable to mask the bottom 14 bits of a XY value.
				77	// @param mask_000F vector of 32 bit constants containing 000F,
				78	// suitable to mask the bottom 4 bits of a XY value.
				79	// @param sixteen_8bit vector of 8 bit components containing the value 16.
				80	// @param mask_dist_select vector of 8 bit components containing the shuffling
				81	// parameters to reorder x[0-3] parameters.
				82	// @param all_xy_result vector of 8 bit components that will contain the
				83	// (4x(y1), 4x(y0), 4x(x1), 4x(x0)) upon return.
				84	// @param sixteen_minus_x vector of 8 bit components, containing
				85	// (4x(16-y1), 4x(16-y0), 4x(16-x1), 4x(16-x0)).
				86	inline void PrepareConstantsTwoPixelPairsDXDY(const uint32_t* xy,
				87	const __m128i& mask_3FFF,
				88	const __m128i& mask_000F,
				89	const __m128i& sixteen_8bit,
				90	const __m128i& mask_dist_select,
				91	__m128i* all_xy_result,
				92	__m128i* sixteen_minus_xy,
				93	int* xy0, int* xy1) {
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	94	const __m128i xy_wide =
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	95	_mm_loadu_si128(reinterpret_cast<const __m128i *>(xy));
				96
				97	// (x10, y10, x00, y00)
				98	__m128i xy0_wide = _mm_srli_epi32(xy_wide, 18);
				99	// (y10, y00, x10, x00)
				100	xy0_wide = _mm_shuffle_epi32(xy0_wide, _MM_SHUFFLE(2, 0, 3, 1));
				101	// (x11, y11, x01, y01)
				102	__m128i xy1_wide = _mm_and_si128(xy_wide, mask_3FFF);
				103	// (y11, y01, x11, x01)
				104	xy1_wide = _mm_shuffle_epi32(xy1_wide, _MM_SHUFFLE(2, 0, 3, 1));
				105
				106	_mm_storeu_si128(reinterpret_cast<__m128i *>(xy0), xy0_wide);
				107	_mm_storeu_si128(reinterpret_cast<__m128i *>(xy1), xy1_wide);
				108
				109	// (x1, y1, x0, y0)
				110	__m128i all_xy = _mm_and_si128(_mm_srli_epi32(xy_wide, 14), mask_000F);
				111	// (y1, y0, x1, x0)
				112	all_xy = _mm_shuffle_epi32(all_xy, _MM_SHUFFLE(2, 0, 3, 1));
				113	// (4x(y1), 4x(y0), 4x(x1), 4x(x0))
				114	all_xy = _mm_shuffle_epi8(all_xy, mask_dist_select);
				115
				116	*all_xy_result = all_xy;
				117	// (4x(16-y1), 4x(16-y0), 4x(16-x1), 4x(16-x0))
				118	*sixteen_minus_xy = _mm_sub_epi8(sixteen_8bit, all_xy);
				119	}
				120
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	121	// Helper function used when processing one pixel pair.
				122	// @param pixel0..3 are the four input pixels
				123	// @param scale_x vector of 8 bit components to multiply the pixel[0:3]. This
				124	// will contain (4x(x1, 16-x1), 4x(x0, 16-x0))
				125	// or (4x(x3, 16-x3), 4x(x2, 16-x2))
				126	// @return a vector of 16 bit components containing:
				127	// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
				128	inline __m128i ProcessPixelPairHelper(uint32_t pixel0,
				129	uint32_t pixel1,
				130	uint32_t pixel2,
				131	uint32_t pixel3,
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	132	const __m128i& scale_x) {
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	133	__m128i a0, a1, a2, a3;
				134	// Load 2 pairs of pixels
				135	a0 = _mm_cvtsi32_si128(pixel0);
				136	a1 = _mm_cvtsi32_si128(pixel1);
				137
				138	// Interleave pixels.
				139	// (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
				140	a0 = _mm_unpacklo_epi8(a0, a1);
				141
				142	a2 = _mm_cvtsi32_si128(pixel2);
				143	a3 = _mm_cvtsi32_si128(pixel3);
				144	// (0, 0, 0, 0, 0, 0, 0, 0, Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2)
				145	a2 = _mm_unpacklo_epi8(a2, a3);
				146
				147	// two pairs of pixel pairs, interleaved.
				148	// (Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2,
				149	// Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
				150	a0 = _mm_unpacklo_epi64(a0, a2);
				151
				152	// multiply and sum to 16 bit components.
				153	// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
				154	// At that point, we use up a bit less than 12 bits for each 16 bit
				155	// component:
				156	// All components are less than 255. So,
				157	// C0 * (16 - x) + C1 * x <= 255 * (16 - x) + 255 * x = 255 * 16.
				158	return _mm_maddubs_epi16(a0, scale_x);
				159	}
				160
				// Scale back the results after multiplications to the [0:255] range, and scale
				// by alpha when has_alpha is true.
				// Depending on whether one set or two sets of multiplications had been applied,
				// the results have to be shifted by four places (dividing by 16), or shifted
				// by eight places (dividing by 256), since each multiplication is by a quantity
				// in the range [0:16].
				// @tparam has_alpha when true, every component is additionally multiplied by
				//                   alpha and shifted right by eight places.
				// @tparam scale number of bits every 16 bit component is shifted right by.
				// @param pixels vector of 16 bit components to rescale. It is updated in
				//               place; must not be NULL.
				// @param alpha vector of 16 bit components holding the alpha scale. Only read
				//              when has_alpha is true.
				// @return the rescaled vector, i.e. the final value of *pixels.
				template<bool has_alpha, int scale>
				inline __m128i ScaleFourPixels(__m128i* pixels,
				                               const __m128i& alpha) {
				    // Divide each 16 bit component by 16 (or 256 depending on scale).
				    *pixels = _mm_srli_epi16(*pixels, scale);

				    if (has_alpha) {
				        // Multiply by alpha.
				        *pixels = _mm_mullo_epi16(*pixels, alpha);

				        // Divide each 16 bit component by 256.
				        *pixels = _mm_srli_epi16(*pixels, 8);
				    }
				    return *pixels;
				}
				182
				183	// Wrapper to calculate two output pixels from four input pixels. The
				184	// arguments are the same as ProcessPixelPairHelper. Technically, there are
				185	// eight input pixels, but since sub_y == 0, the factors applied to half of the
				186	// pixels is zero (sub_y), and are therefore omitted here to save on some
				187	// processing.
				188	// @param alpha when has_alpha is true, scale all resulting components by this
				189	// value.
				190	// @return a vector of 16 bit components containing:
				191	// ((Aa2 * (16 - x1) + Aa3 * x1) * alpha, ...,
				192	// (Ra0 * (16 - x0) + Ra1 * x0) * alpha) (when has_alpha is true)
				193	// otherwise
				194	// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
				195	// In both cases, the results are renormalized (divided by 16) to match the
				196	// expected formats when storing back the results into memory.
				197	template<bool has_alpha>
				198	inline __m128i ProcessPixelPairZeroSubY(uint32_t pixel0,
				199	uint32_t pixel1,
				200	uint32_t pixel2,
				201	uint32_t pixel3,
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	202	const __m128i& scale_x,
				203	const __m128i& alpha) {
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	204	__m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
				205	scale_x);
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	206	return ScaleFourPixels<has_alpha, 4>(&sum, alpha);
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	207	}
				208
				209	// Same as ProcessPixelPairZeroSubY, expect processing one output pixel at a
				210	// time instead of two. As in the above function, only two pixels are needed
				211	// to generate a single pixel since sub_y == 0.
				212	// @return same as ProcessPixelPairZeroSubY, except that only the bottom 4
				213	// 16 bit components are set.
				214	template<bool has_alpha>
				215	inline __m128i ProcessOnePixelZeroSubY(uint32_t pixel0,
				216	uint32_t pixel1,
				217	__m128i scale_x,
				218	__m128i alpha) {
				219	__m128i a0 = _mm_cvtsi32_si128(pixel0);
				220	__m128i a1 = _mm_cvtsi32_si128(pixel1);
				221
				222	// Interleave
				223	a0 = _mm_unpacklo_epi8(a0, a1);
				224
				225	// (a0 * (16-x) + a1 * x)
				226	__m128i sum = _mm_maddubs_epi16(a0, scale_x);
				227
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	228	return ScaleFourPixels<has_alpha, 4>(&sum, alpha);
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	229	}
				230
				231	// Methods when sub_y != 0
				232
				233
				234	// Same as ProcessPixelPairHelper, except that the values are scaled by y.
				235	// @param y vector of 16 bit components containing 'y' values. There are two
				236	// cases in practice, where y will contain the sub_y constant, or will
				237	// contain the 16 - sub_y constant.
				238	// @return vector of 16 bit components containing:
				239	// (y * (Aa2 * (16 - x1) + Aa3 * x1), ... , y * (Ra0 * (16 - x0) + Ra1 * x0))
				240	inline __m128i ProcessPixelPair(uint32_t pixel0,
				241	uint32_t pixel1,
				242	uint32_t pixel2,
				243	uint32_t pixel3,
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	244	const __m128i& scale_x,
				245	const __m128i& y) {
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	246	__m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
				247	scale_x);
				248
				249	// first row times 16-y or y depending on whether 'y' represents one or
				250	// the other.
				251	// Values will be up to 255 * 16 * 16 = 65280.
				252	// (y * (Aa2 * (16 - x1) + Aa3 * x1), ... ,
				253	// y * (Ra0 * (16 - x0) + Ra1 * x0))
				254	sum = _mm_mullo_epi16(sum, y);
				255
				256	return sum;
				257	}
				258
				259	// Process two pixel pairs out of eight input pixels.
				260	// In other methods, the distinct pixels are passed one by one, but in this
				261	// case, the rows, and index offsets to the pixels into the row are passed
				262	// to generate the 8 pixels.
				263	// @param row0..1 top and bottom row where to find input pixels.
				264	// @param x0..1 offsets into the row for all eight input pixels.
				265	// @param all_y vector of 16 bit components containing the constant sub_y
				266	// @param neg_y vector of 16 bit components containing the constant 16 - sub_y
				267	// @param alpha vector of 16 bit components containing the alpha value to scale
				268	// the results by, when has_alpha is true.
				269	// @return
				270	// (alpha * ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) +
				271	// y * (Aa2' * (16-x1) + Aa3' * x1)),
				272	// ...
				273	// alpha * ((16-y) * (Ra0 * (16-x0) + Ra1 * x0) +
				274	// y * (Ra0' * (16-x0) + Ra1' * x0))
				275	// With the factor alpha removed when has_alpha is false.
				276	// The values are scaled back to 16 bit components, but with only the bottom
				277	// 8 bits being set.
				278	template<bool has_alpha>
				279	inline __m128i ProcessTwoPixelPairs(const uint32_t* row0,
				280	const uint32_t* row1,
				281	const int* x0,
				282	const int* x1,
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	283	const __m128i& scale_x,
				284	const __m128i& all_y,
				285	const __m128i& neg_y,
				286	const __m128i& alpha) {
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	287	__m128i sum0 = ProcessPixelPair(
				288	row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
				289	scale_x, neg_y);
				290	__m128i sum1 = ProcessPixelPair(
				291	row1[x0[0]], row1[x1[0]], row1[x0[1]], row1[x1[1]],
				292	scale_x, all_y);
				293
				294	// 2 samples fully summed.
				295	// ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) +
				296	// y * (Aa2' * (16-x1) + Aa3' * x1),
				297	// ...
				298	// (16-y) * (Ra0 * (16 - x0) + Ra1 * x0)) +
				299	// y * (Ra0' * (16-x0) + Ra1' * x0))
				300	// Each component, again can be at most 256 * 255 = 65280, so no overflow.
				301	sum0 = _mm_add_epi16(sum0, sum1);
				302
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	303	return ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	304	}
				305
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	306	// Similar to ProcessTwoPixelPairs except the pixel indexes.
				307	template<bool has_alpha>
				308	inline __m128i ProcessTwoPixelPairsDXDY(const uint32_t* row00,
				309	const uint32_t* row01,
				310	const uint32_t* row10,
				311	const uint32_t* row11,
				312	const int* xy0,
				313	const int* xy1,
				314	const __m128i& scale_x,
				315	const __m128i& all_y,
				316	const __m128i& neg_y,
				317	const __m128i& alpha) {
				318	// first row
				319	__m128i sum0 = ProcessPixelPair(
				320	row00[xy0[0]], row00[xy1[0]], row10[xy0[1]], row10[xy1[1]],
				321	scale_x, neg_y);
				322	// second row
				323	__m128i sum1 = ProcessPixelPair(
				324	row01[xy0[0]], row01[xy1[0]], row11[xy0[1]], row11[xy1[1]],
				325	scale_x, all_y);
				326
				327	// 2 samples fully summed.
				328	// ((16-y1) * (Aa2 * (16-x1) + Aa3 * x1) +
				329	// y0 * (Aa2' * (16-x1) + Aa3' * x1),
				330	// ...
				331	// (16-y0) * (Ra0 * (16 - x0) + Ra1 * x0)) +
				332	// y0 * (Ra0' * (16-x0) + Ra1' * x0))
				333	// Each component, again can be at most 256 * 255 = 65280, so no overflow.
				334	sum0 = _mm_add_epi16(sum0, sum1);
				335
				336	return ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
				337	}
				338
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	339
				340	// Same as ProcessPixelPair, except that performing the math one output pixel
				341	// at a time. This means that only the bottom four 16 bit components are set.
				342	inline __m128i ProcessOnePixel(uint32_t pixel0, uint32_t pixel1,
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	343	const __m128i& scale_x, const __m128i& y) {
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	344	__m128i a0 = _mm_cvtsi32_si128(pixel0);
				345	__m128i a1 = _mm_cvtsi32_si128(pixel1);
				346
				347	// Interleave
				348	// (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
				349	a0 = _mm_unpacklo_epi8(a0, a1);
				350
				351	// (a0 * (16-x) + a1 * x)
				352	a0 = _mm_maddubs_epi16(a0, scale_x);
				353
				354	// scale row by y
				355	return _mm_mullo_epi16(a0, y);
				356	}
				357
				358	// Notes about the various tricks that are used in this implementation:
				359	// - specialization for sub_y == 0.
				360	// Statistically, 1/16th of the samples will have sub_y == 0. When this
				361	// happens, the math goes from:
				362	// (16 - x)(16 - y)a00 + x(16 - y)a01 + (16 - x)ya10 + xya11
				363	// to:
				364	// (16 - x)a00 + 16x*a01
				365	// much simpler. The simplification makes for an easy boost in performance.
				366	// - calculating 4 output pixels at a time.
				367	// This allows loading the coefficients x0 and x1 and shuffling them to the
				368	// optimum location only once per loop, instead of twice per loop.
				369	// This also allows us to store the four pixels with a single store.
				370	// - Use of 2 special SSSE3 instructions (comparatively to the SSE2 instruction
				371	// version):
				372	// _mm_shuffle_epi8 : this allows us to spread the coefficients x[0-3] loaded
				373	// in 32 bit values to 8 bit values repeated four times.
				374	// _mm_maddubs_epi16 : this allows us to perform multiplications and additions
				375	// in one swoop of 8bit values storing the results in 16 bit values. This
				376	// instruction is actually crucial for the speed of the implementation since
				377	// as one can see in the SSE2 implementation, all inputs have to be used as
				378	// 16 bits because the results are 16 bits. This basically allows us to process
				379	// twice as many pixel components per iteration.
				380	//
				381	// As a result, this method behaves faster than the traditional SSE2. The actual
				382	// boost varies greatly on the underlying architecture.
				383	template<bool has_alpha>
				384	void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
				385	const uint32_t* xy,
				386	int count, uint32_t* colors) {
				387	SkASSERT(count > 0 && colors != NULL);
				388	SkASSERT(s.fDoFilter);
				389	SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
				390	if (has_alpha) {
				391	SkASSERT(s.fAlphaScale < 256);
				392	} else {
				393	SkASSERT(s.fAlphaScale == 256);
				394	}
				395
				396	const uint8_t* src_addr =
				397	static_cast<const uint8_t*>(s.fBitmap->getPixels());
scroggo@google.com	e5f4824	2013-02-25 21:47:41 +0000	[diff] [blame]	398	const size_t rb = s.fBitmap->rowBytes();
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	399	const uint32_t XY = *xy++;
				400	const unsigned y0 = XY >> 14;
				401	const uint32_t* row0 =
				402	reinterpret_cast<const uint32_t>(src_addr + (y0 >> 4) rb);
				403	const uint32_t* row1 =
				404	reinterpret_cast<const uint32_t>(src_addr + (XY & 0x3FFF) rb);
				405	const unsigned sub_y = y0 & 0xF;
				406
				407	// vector constants
				408	const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12,
				409	8, 8, 8, 8,
				410	4, 4, 4, 4,
				411	0, 0, 0, 0);
				412	const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF);
				413	const __m128i mask_000F = _mm_set1_epi32(0x000F);
				414	const __m128i sixteen_8bit = _mm_set1_epi8(16);
				415	// (0, 0, 0, 0, 0, 0, 0, 0)
				416	const __m128i zero = _mm_setzero_si128();
				417
tomhudson@google.com	8afae61	2012-08-14 15:03:35 +0000	[diff] [blame]	418	__m128i alpha = _mm_setzero_si128();
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	419	if (has_alpha)
				420	// 8x(alpha)
				421	alpha = _mm_set1_epi16(s.fAlphaScale);
				422
				423	if (sub_y == 0) {
				424	// Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
				425	while (count > 3) {
				426	count -= 4;
				427
				428	int x0[4];
				429	int x1[4];
				430	__m128i all_x, sixteen_minus_x;
				431	PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
				432	sixteen_8bit, mask_dist_select,
				433	&all_x, &sixteen_minus_x, x0, x1);
				434	xy += 4;
				435
				436	// First pair of pixel pairs.
				437	// (4x(x1, 16-x1), 4x(x0, 16-x0))
				438	__m128i scale_x;
				439	scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);
				440
				441	__m128i sum0 = ProcessPixelPairZeroSubY<has_alpha>(
				442	row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
				443	scale_x, alpha);
				444
				445	// second pair of pixel pairs
				446	// (4x (x3, 16-x3), 4x (16-x2, x2))
				447	scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);
				448
				449	__m128i sum1 = ProcessPixelPairZeroSubY<has_alpha>(
				450	row0[x0[2]], row0[x1[2]], row0[x0[3]], row0[x1[3]],
				451	scale_x, alpha);
				452
				453	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				454	sum0 = _mm_packus_epi16(sum0, sum1);
				455
				456	// Extract low int and store.
				457	_mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);
				458
				459	colors += 4;
				460	}
				461
				462	// handle remainder
				463	while (count-- > 0) {
				464	uint32_t xx = *xy++; // x0:14 \| 4 \| x1:14
				465	unsigned x0 = xx >> 18;
				466	unsigned x1 = xx & 0x3FFF;
				467
				468	// 16x(x)
				469	const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);
				470
				471	// (16x(16-x))
				472	__m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
				473
				474	scale_x = _mm_unpacklo_epi8(scale_x, all_x);
				475
				476	__m128i sum = ProcessOnePixelZeroSubY<has_alpha>(
				477	row0[x0], row0[x1],
				478	scale_x, alpha);
				479
				480	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				481	sum = _mm_packus_epi16(sum, zero);
				482
				483	// Extract low int and store.
				484	*colors++ = _mm_cvtsi128_si32(sum);
				485	}
				486	} else { // more general case, y != 0
				487	// 8x(16)
				488	const __m128i sixteen_16bit = _mm_set1_epi16(16);
				489
				490	// 8x (y)
				491	const __m128i all_y = _mm_set1_epi16(sub_y);
				492
				493	// 8x (16-y)
				494	const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y);
				495
				496	// Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
				497	while (count > 3) {
				498	count -= 4;
				499
				500	int x0[4];
				501	int x1[4];
				502	__m128i all_x, sixteen_minus_x;
				503	PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
				504	sixteen_8bit, mask_dist_select,
				505	&all_x, &sixteen_minus_x, x0, x1);
				506	xy += 4;
				507
				508	// First pair of pixel pairs
				509	// (4x(x1, 16-x1), 4x(x0, 16-x0))
				510	__m128i scale_x;
				511	scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);
				512
				513	__m128i sum0 = ProcessTwoPixelPairs<has_alpha>(
				514	row0, row1, x0, x1,
				515	scale_x, all_y, neg_y, alpha);
				516
				517	// second pair of pixel pairs
				518	// (4x (x3, 16-x3), 4x (16-x2, x2))
				519	scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);
				520
				521	__m128i sum1 = ProcessTwoPixelPairs<has_alpha>(
				522	row0, row1, x0 + 2, x1 + 2,
				523	scale_x, all_y, neg_y, alpha);
				524
				525	// Do the final packing of the two results
				526
				527	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				528	sum0 = _mm_packus_epi16(sum0, sum1);
				529
				530	// Extract low int and store.
				531	_mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);
				532
				533	colors += 4;
				534	}
				535
				536	// Left over.
				537	while (count-- > 0) {
				538	const uint32_t xx = *xy++; // x0:14 \| 4 \| x1:14
				539	const unsigned x0 = xx >> 18;
				540	const unsigned x1 = xx & 0x3FFF;
				541
				542	// 16x(x)
				543	const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);
				544
				545	// 16x (16-x)
				546	__m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
				547
				548	// (8x (x, 16-x))
				549	scale_x = _mm_unpacklo_epi8(scale_x, all_x);
				550
				551	// first row.
				552	__m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y);
				553	// second row.
				554	__m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y);
				555
				556	// Add both rows for full sample
				557	sum0 = _mm_add_epi16(sum0, sum1);
				558
tomhudson@google.com	4ef14f8	2012-02-14 19:42:39 +0000	[diff] [blame]	559	sum0 = ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	560
				561	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				562	sum0 = _mm_packus_epi16(sum0, zero);
				563
				564	// Extract low int and store.
				565	*colors++ = _mm_cvtsi128_si32(sum0);
				566	}
				567	}
				568	}
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	569
				570	/*
				571	* Similar to S32_generic_D32_filter_DX_SSSE3, we do not need to handle the
				572	* special case suby == 0 as suby is changing in every loop.
				573	*/
				574	template<bool has_alpha>
				575	void S32_generic_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
				576	const uint32_t* xy,
				577	int count, uint32_t* colors) {
				578	SkASSERT(count > 0 && colors != NULL);
				579	SkASSERT(s.fDoFilter);
				580	SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
				581	if (has_alpha) {
				582	SkASSERT(s.fAlphaScale < 256);
				583	} else {
				584	SkASSERT(s.fAlphaScale == 256);
				585	}
				586
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	587	const uint8_t* src_addr =
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	588	static_cast<const uint8_t*>(s.fBitmap->getPixels());
scroggo@google.com	e5f4824	2013-02-25 21:47:41 +0000	[diff] [blame]	589	const size_t rb = s.fBitmap->rowBytes();
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	590
				591	// vector constants
				592	const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12,
				593	8, 8, 8, 8,
				594	4, 4, 4, 4,
				595	0, 0, 0, 0);
				596	const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF);
				597	const __m128i mask_000F = _mm_set1_epi32(0x000F);
				598	const __m128i sixteen_8bit = _mm_set1_epi8(16);
				599
				600	__m128i alpha;
				601	if (has_alpha) {
				602	// 8x(alpha)
				603	alpha = _mm_set1_epi16(s.fAlphaScale);
				604	}
				605
				606	// Unroll 2x, interleave bytes, use pmaddubsw (all_x is small)
				607	while (count >= 2) {
				608	int xy0[4];
				609	int xy1[4];
				610	__m128i all_xy, sixteen_minus_xy;
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	611	PrepareConstantsTwoPixelPairsDXDY(xy, mask_3FFF, mask_000F,
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	612	sixteen_8bit, mask_dist_select,
				613	&all_xy, &sixteen_minus_xy, xy0, xy1);
				614
				615	// (4x(x1, 16-x1), 4x(x0, 16-x0))
				616	__m128i scale_x = _mm_unpacklo_epi8(sixteen_minus_xy, all_xy);
				617	// (4x(0, y1), 4x(0, y0))
				618	__m128i all_y = _mm_unpackhi_epi8(all_xy, _mm_setzero_si128());
				619	__m128i neg_y = _mm_sub_epi16(_mm_set1_epi16(16), all_y);
				620
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	621	const uint32_t* row00 =
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	622	reinterpret_cast<const uint32_t>(src_addr + xy0[2] rb);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	623	const uint32_t* row01 =
				624	reinterpret_cast<const uint32_t>(src_addr + xy1[2] rb);
				625	const uint32_t* row10 =
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	626	reinterpret_cast<const uint32_t>(src_addr + xy0[3] rb);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	627	const uint32_t* row11 =
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	628	reinterpret_cast<const uint32_t>(src_addr + xy1[3] rb);
				629
				630	__m128i sum0 = ProcessTwoPixelPairsDXDY<has_alpha>(
				631	row00, row01, row10, row11, xy0, xy1,
				632	scale_x, all_y, neg_y, alpha);
				633
				634	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				635	sum0 = _mm_packus_epi16(sum0, _mm_setzero_si128());
				636
				637	// Extract low int and store.
				638	_mm_storel_epi64(reinterpret_cast<__m128i *>(colors), sum0);
				639
				640	xy += 4;
				641	colors += 2;
				642	count -= 2;
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	643	}
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	644
				645	// Handle the remainder
				646	while (count-- > 0) {
				647	uint32_t data = *xy++;
				648	unsigned y0 = data >> 14;
				649	unsigned y1 = data & 0x3FFF;
				650	unsigned subY = y0 & 0xF;
				651	y0 >>= 4;
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	652
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	653	data = *xy++;
				654	unsigned x0 = data >> 14;
				655	unsigned x1 = data & 0x3FFF;
				656	unsigned subX = x0 & 0xF;
				657	x0 >>= 4;
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	658
				659	const uint32_t* row0 =
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	660	reinterpret_cast<const uint32_t>(src_addr + y0 rb);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	661	const uint32_t* row1 =
				662	reinterpret_cast<const uint32_t>(src_addr + y1 rb);
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	663
				664	// 16x(x)
				665	const __m128i all_x = _mm_set1_epi8(subX);
				666
				667	// 16x (16-x)
				668	__m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
				669
				670	// (8x (x, 16-x))
				671	scale_x = _mm_unpacklo_epi8(scale_x, all_x);
				672
				673	// 8x(16)
				674	const __m128i sixteen_16bit = _mm_set1_epi16(16);
				675
				676	// 8x (y)
				677	const __m128i all_y = _mm_set1_epi16(subY);
				678
				679	// 8x (16-y)
				680	const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y);
				681
				682	// first row.
				683	__m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y);
				684	// second row.
				685	__m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y);
				686
				687	// Add both rows for full sample
				688	sum0 = _mm_add_epi16(sum0, sum1);
				689
				690	sum0 = ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
				691
				692	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				693	sum0 = _mm_packus_epi16(sum0, _mm_setzero_si128());
				694
				695	// Extract low int and store.
				696	*colors++ = _mm_cvtsi128_si32(sum0);
				697	}
				698	}
tomhudson@google.com	95ad155	2012-02-14 18:28:54 +0000	[diff] [blame]	699	} // namespace
				700
				701	void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
				702	const uint32_t* xy,
				703	int count, uint32_t* colors) {
				704	S32_generic_D32_filter_DX_SSSE3<false>(s, xy, count, colors);
				705	}
				706
				707	void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
				708	const uint32_t* xy,
				709	int count, uint32_t* colors) {
				710	S32_generic_D32_filter_DX_SSSE3<true>(s, xy, count, colors);
				711	}
tomhudson@google.com	ae29b88	2012-03-06 14:59:04 +0000	[diff] [blame]	712
				713	void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
				714	const uint32_t* xy,
				715	int count, uint32_t* colors) {
				716	S32_generic_D32_filter_DXDY_SSSE3<false>(s, xy, count, colors);
				717	}
				718
				719	void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s,
				720	const uint32_t* xy,
				721	int count, uint32_t* colors) {
				722	S32_generic_D32_filter_DXDY_SSSE3<true>(s, xy, count, colors);
				723	}