
/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


#include <emmintrin.h>
#include "SkBitmapProcState_opts_SSE2.h"
#include "SkPaint.h"
#include "SkUtils.h"

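/* The coordinate arrays consumed by the filter procs in this file pack two
 * 14-bit texel indices and a 4-bit subpixel fraction into each 32-bit word:
 * (index0 << 18) | (fraction << 14) | index1. The first word carries the two
 * Y indices plus the Y fraction; each following word carries a pair of X
 * indices plus the X fraction. This note is derived from the unpacking below
 * (XY >> 14, XY & 0x3FFF, etc.) and from ClampX_ClampY_pack_filter() further
 * down, which produces the same layout.
 */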
void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   const uint32_t* xy,
                                   int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != NULL);
    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    SkASSERT(s.fAlphaScale == 256);

    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
    size_t rb = s.fBitmap->rowBytes();
    uint32_t XY = *xy++;
    unsigned y0 = XY >> 14;
    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    unsigned subY = y0 & 0xF;

    // ( 0, 0, 0, 0, 0, 0, 0, 16)
    __m128i sixteen = _mm_cvtsi32_si128(16);

    // ( 0, 0, 0, 0, 16, 16, 16, 16)
    sixteen = _mm_shufflelo_epi16(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, y)
    __m128i allY = _mm_cvtsi32_si128(subY);

    // ( 0, 0, 0, 0, y, y, y, y)
    allY = _mm_shufflelo_epi16(allY, 0);

    // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
    __m128i negY = _mm_sub_epi16(sixteen, allY);

    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    allY = _mm_unpacklo_epi64(allY, negY);

    // (16, 16, 16, 16, 16, 16, 16, 16)
    sixteen = _mm_shuffle_epi32(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, 0)
    __m128i zero = _mm_setzero_si128();
    do {
        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
        unsigned x0 = XX >> 18;
        unsigned x1 = XX & 0x3FFF;

        // (0, 0, 0, 0, 0, 0, 0, x)
        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

        // (0, 0, 0, 0, x, x, x, x)
        allX = _mm_shufflelo_epi16(allX, 0);

        // (x, x, x, x, x, x, x, x)
        allX = _mm_shuffle_epi32(allX, 0);

        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
        __m128i negX = _mm_sub_epi16(sixteen, allX);

        // Load 4 samples (pixels).
        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);

        // (0, 0, a00, a10)
        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

        // Expand to 16 bits per component.
        a00a10 = _mm_unpacklo_epi8(a00a10, zero);

        // ((a00 * (16-y)), (a10 * y)).
        a00a10 = _mm_mullo_epi16(a00a10, allY);

        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
        a00a10 = _mm_mullo_epi16(a00a10, negX);

        // (0, 0, a01, a11)
        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

        // Expand to 16 bits per component.
        a01a11 = _mm_unpacklo_epi8(a01a11, zero);

        // (a01 * (16-y)), (a11 * y)
        a01a11 = _mm_mullo_epi16(a01a11, allY);

        // (a01 * (16-y) * x), (a11 * y * x)
        a01a11 = _mm_mullo_epi16(a01a11, allX);

        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
        __m128i sum = _mm_add_epi16(a00a10, a01a11);

        // (DC, a00*w00 + a01*w01)
        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
        sum = _mm_add_epi16(sum, shifted);

        // Divide each 16 bit component by 256.
        sum = _mm_srli_epi16(sum, 8);

        // Pack lower 4 16 bit values of sum into lower 4 bytes.
        sum = _mm_packus_epi16(sum, zero);

        // Extract low int and store.
        *colors++ = _mm_cvtsi128_si32(sum);
    } while (--count > 0);
}
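
/* For reference, a scalar sketch of the per-channel math that the SIMD code
 * in S32_opaque_D32_filter_DX_SSE2 performs (illustrative only; the names
 * below are not part of Skia). The four weights sum to 16*16 = 256, so the
 * final >> 8 renormalizes:
 *
 *   unsigned w00 = (16 - x) * (16 - y);
 *   unsigned w01 = x * (16 - y);
 *   unsigned w10 = (16 - x) * y;
 *   unsigned w11 = x * y;
 *   for (int c = 0; c < 4; c++) {   // the four 8-bit channels
 *       dst[c] = (a00[c]*w00 + a01[c]*w01 + a10[c]*w10 + a11[c]*w11) >> 8;
 *   }
 */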

void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                  const uint32_t* xy,
                                  int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != NULL);
    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    SkASSERT(s.fAlphaScale < 256);

    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
    size_t rb = s.fBitmap->rowBytes();
    uint32_t XY = *xy++;
    unsigned y0 = XY >> 14;
    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    unsigned subY = y0 & 0xF;

    // ( 0, 0, 0, 0, 0, 0, 0, 16)
    __m128i sixteen = _mm_cvtsi32_si128(16);

    // ( 0, 0, 0, 0, 16, 16, 16, 16)
    sixteen = _mm_shufflelo_epi16(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, y)
    __m128i allY = _mm_cvtsi32_si128(subY);

    // ( 0, 0, 0, 0, y, y, y, y)
    allY = _mm_shufflelo_epi16(allY, 0);

    // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
    __m128i negY = _mm_sub_epi16(sixteen, allY);

    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    allY = _mm_unpacklo_epi64(allY, negY);

    // (16, 16, 16, 16, 16, 16, 16, 16)
    sixteen = _mm_shuffle_epi32(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, 0)
    __m128i zero = _mm_setzero_si128();

    // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
    __m128i alpha = _mm_set1_epi16(s.fAlphaScale);

    do {
        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
        unsigned x0 = XX >> 18;
        unsigned x1 = XX & 0x3FFF;

        // (0, 0, 0, 0, 0, 0, 0, x)
        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

        // (0, 0, 0, 0, x, x, x, x)
        allX = _mm_shufflelo_epi16(allX, 0);

        // (x, x, x, x, x, x, x, x)
        allX = _mm_shuffle_epi32(allX, 0);

        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
        __m128i negX = _mm_sub_epi16(sixteen, allX);

        // Load 4 samples (pixels).
        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);

        // (0, 0, a00, a10)
        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

        // Expand to 16 bits per component.
        a00a10 = _mm_unpacklo_epi8(a00a10, zero);

        // ((a00 * (16-y)), (a10 * y)).
        a00a10 = _mm_mullo_epi16(a00a10, allY);

        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
        a00a10 = _mm_mullo_epi16(a00a10, negX);

        // (0, 0, a01, a11)
        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

        // Expand to 16 bits per component.
        a01a11 = _mm_unpacklo_epi8(a01a11, zero);

        // (a01 * (16-y)), (a11 * y)
        a01a11 = _mm_mullo_epi16(a01a11, allY);

        // (a01 * (16-y) * x), (a11 * y * x)
        a01a11 = _mm_mullo_epi16(a01a11, allX);

        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
        __m128i sum = _mm_add_epi16(a00a10, a01a11);

        // (DC, a00*w00 + a01*w01)
        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
        sum = _mm_add_epi16(sum, shifted);

        // Divide each 16 bit component by 256.
        sum = _mm_srli_epi16(sum, 8);

        // Multiply by alpha.
        sum = _mm_mullo_epi16(sum, alpha);

        // Divide each 16 bit component by 256.
        sum = _mm_srli_epi16(sum, 8);

        // Pack lower 4 16 bit values of sum into lower 4 bytes.
        sum = _mm_packus_epi16(sum, zero);

        // Extract low int and store.
        *colors++ = _mm_cvtsi128_si32(sum);
    } while (--count > 0);
}
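
/* The only difference from the opaque version above is the extra modulation
 * step inside the loop: after the bilinear sum is normalized (>> 8), every
 * channel is multiplied by s.fAlphaScale (which is < 256 here; the == 256
 * case takes the opaque path) and normalized again (>> 8). For example,
 * fAlphaScale == 128 halves each filtered channel.
 */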

static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
                                                 SkFixed one) {
    unsigned i = SkClampMax(f >> 16, max);
    i = (i << 4) | ((f >> 12) & 0xF);
    return (i << 14) | SkClampMax((f + one) >> 16, max);
}
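
/* A worked example of the packing above (assuming max is large enough that
 * nothing clamps and one == SK_Fixed1): f = 0x00053A00 is 5 + 58/256 in
 * 16.16 fixed point, so i = 5, the 4-bit fraction is (f >> 12) & 0xF = 3,
 * and the second index is (f + one) >> 16 = 6. The packed word is
 * ((5 << 4 | 3) << 14) | 6 = 0x0014C006: texel 5, fraction 3/16, texel 6.
 */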

/* SSE version of ClampX_ClampY_filter_scale()
 * portable version is in core/SkBitmapProcState_matrix.h
 */
void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
                                     int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);
    SkASSERT(s.fInvKy == 0);

    const unsigned maxX = s.fBitmap->width() - 1;
    const SkFixed one = s.fFilterOneX;
    const SkFixed dx = s.fInvSx;
    SkFixed fx;

    SkPoint pt;
    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &pt);
    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
    const unsigned maxY = s.fBitmap->height() - 1;
    // compute our two Y values up front
    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
    // now initialize fx
    fx = SkScalarToFixed(pt.fX) - (one >> 1);

    // test if we don't need to apply the tile proc
    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
        if (count >= 4) {
            // SSE version of decal_filter_scale
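            // In the decal path nothing clamps, so each packed word is just
            // (x0 << 18) | (subpixel << 14) | (x0 + 1): with fx in range
            // (see the assert below), ((fx >> 16) << 4) | ((fx >> 12) & 0xF)
            // equals fx >> 12, and the second index is simply the next texel.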
            while ((size_t(xy) & 0x0F) != 0) {
                SkASSERT((fx >> (16 + 14)) == 0);
                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
                fx += dx;
                count--;
            }

            __m128i wide_1 = _mm_set1_epi32(1);
            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
            __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
                                            fx + dx, fx);

            while (count >= 4) {
                __m128i wide_out;

                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
                                        _mm_srai_epi32(wide_fx, 16), wide_1));

                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);

                xy += 4;
                fx += dx * 4;
                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
                count -= 4;
            } // while count >= 4
        } // if count >= 4

        while (count-- > 0) {
            SkASSERT((fx >> (16 + 14)) == 0);
            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
            fx += dx;
        }
    } else {
        // SSE2 only supports 16-bit integer max & min, so only take this path
        // when maxX fits in 16 bits. maxX is the bitmap's width - 1, and
        // bitmaps wider than the maximum 16-bit integer should be rare in the
        // real world.
        if ((count >= 4) && (maxX <= 0xFFFF)) {
            while (((size_t)xy & 0x0F) != 0) {
                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
                fx += dx;
                count--;
            }

            __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
                                            fx + dx, fx);
            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
            __m128i wide_one = _mm_set1_epi32(one);
            __m128i wide_maxX = _mm_set1_epi32(maxX);
            __m128i wide_mask = _mm_set1_epi32(0xF);

            while (count >= 4) {
                __m128i wide_i;
                __m128i wide_lo;
                __m128i wide_fx1;

                // i = SkClampMax(f>>16,maxX)
                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
                                       _mm_setzero_si128());
                wide_i = _mm_min_epi16(wide_i, wide_maxX);

                // i<<4 | TILEX_LOW_BITS(fx)
                wide_lo = _mm_srli_epi32(wide_fx, 12);
                wide_lo = _mm_and_si128(wide_lo, wide_mask);
                wide_i = _mm_slli_epi32(wide_i, 4);
                wide_i = _mm_or_si128(wide_i, wide_lo);

                // i<<14
                wide_i = _mm_slli_epi32(wide_i, 14);

                // SkClampMax(((f+one))>>16,max)
                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
                                         _mm_setzero_si128());
                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);

                // final combination
                wide_i = _mm_or_si128(wide_i, wide_fx1);
                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);

                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
                fx += dx * 4;
                xy += 4;
                count -= 4;
            } // while count >= 4
        } // if count >= 4

        while (count-- > 0) {
            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
            fx += dx;
        }
    }
}

/* SSE version of ClampX_ClampY_nofilter_scale()
 * portable version is in core/SkBitmapProcState_matrix.h
 */
void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
                                       uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);

    // we store y, x, x, x, x, x
    const unsigned maxX = s.fBitmap->width() - 1;
    SkFixed fx;
    SkPoint pt;
    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &pt);
    fx = SkScalarToFixed(pt.fY);
    const unsigned maxY = s.fBitmap->height() - 1;
    *xy++ = SkClampMax(fx >> 16, maxY);
    fx = SkScalarToFixed(pt.fX);

    if (0 == maxX) {
        // all of the following X values must be 0
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    const SkFixed dx = s.fInvSx;

    // test if we don't need to apply the tile proc
    if ((unsigned)(fx >> 16) <= maxX &&
        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
        // SSE version of decal_nofilter_scale
        if (count >= 8) {
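            // Scalar prologue: emit x indices two at a time with
            // pack_two_shorts() until xy reaches 16-byte alignment, so the
            // main loop below can use the aligned _mm_store_si128.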
            while (((size_t)xy & 0x0F) != 0) {
                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
                fx += 2 * dx;
                count -= 2;
            }

            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);

            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
                                             fx + dx, fx);
            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);

            while (count >= 8) {
                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);

                __m128i wide_result = _mm_packs_epi32(wide_out_low,
                                                      wide_out_high);
                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);

                wide_low = _mm_add_epi32(wide_low, wide_dx8);
                wide_high = _mm_add_epi32(wide_high, wide_dx8);

                xy += 4;
                fx += dx * 8;
                count -= 8;
            }
        } // if count >= 8

        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
        while (count-- > 0) {
            *xx++ = SkToU16(fx >> 16);
            fx += dx;
        }
    } else {
        // SSE2 only supports 16-bit integer max & min, so only take this path
        // when maxX fits in 16 bits. maxX is the bitmap's width - 1, and
        // bitmaps wider than the maximum 16-bit integer should be rare in the
        // real world.
        if ((count >= 8) && (maxX <= 0xFFFF)) {
            while (((size_t)xy & 0x0F) != 0) {
                *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
                                        SkClampMax(fx >> 16, maxX));
                fx += 2 * dx;
                count -= 2;
            }

            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);

            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
                                             fx + dx, fx);
            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
            __m128i wide_maxX = _mm_set1_epi32(maxX);

            while (count >= 8) {
                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);

                wide_out_low = _mm_max_epi16(wide_out_low,
                                             _mm_setzero_si128());
                wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
                wide_out_high = _mm_max_epi16(wide_out_high,
                                              _mm_setzero_si128());
                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);

                __m128i wide_result = _mm_packs_epi32(wide_out_low,
                                                      wide_out_high);
                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);

                wide_low = _mm_add_epi32(wide_low, wide_dx8);
                wide_high = _mm_add_epi32(wide_high, wide_dx8);

                xy += 4;
                fx += dx * 8;
                count -= 8;
            }
        } // if count >= 8

        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
        while (count-- > 0) {
            *xx++ = SkClampMax(fx >> 16, maxX);
            fx += dx;
        }
    }
}

/* SSE version of ClampX_ClampY_filter_affine()
 * portable version is in core/SkBitmapProcState_matrix.h
 */
void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
                                      uint32_t xy[], int count, int x, int y) {
    SkPoint srcPt;
    s.fInvProc(s.fInvMatrix,
               SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);

    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;
    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
    SkFixed dx = s.fInvSx;
    SkFixed dy = s.fInvKy;
    unsigned maxX = s.fBitmap->width() - 1;
    unsigned maxY = s.fBitmap->height() - 1;

    if (count >= 2 && (maxX <= 0xFFFF)) {
        SkFixed dx2 = dx + dx;
        SkFixed dy2 = dy + dy;

        __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
        __m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2);
        __m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY);
        __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
        __m128i wide_mask = _mm_set1_epi32(0xF);

        while (count >= 2) {
            // i = SkClampMax(f>>16,maxX)
            __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
                                           _mm_setzero_si128());
            wide_i = _mm_min_epi16(wide_i, wide_max);

            // i<<4 | TILEX_LOW_BITS(f)
            __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
            wide_lo = _mm_and_si128(wide_lo, wide_mask);
            wide_i = _mm_slli_epi32(wide_i, 4);
            wide_i = _mm_or_si128(wide_i, wide_lo);

            // i<<14
            wide_i = _mm_slli_epi32(wide_i, 14);

            // SkClampMax(((f+one))>>16,max)
            __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
            wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
                                    _mm_setzero_si128());
            wide_f1 = _mm_min_epi16(wide_f1, wide_max);

            // final combination
            wide_i = _mm_or_si128(wide_i, wide_f1);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);

            wide_f = _mm_add_epi32(wide_f, wide_d2);

            fx += dx2;
            fy += dy2;
            xy += 4;
            count -= 2;
        } // while count >= 2
    } // if count >= 2

    while (count-- > 0) {
        *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
        fy += dy;
        *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
        fx += dx;
    }
}

/* SSE version of ClampX_ClampY_nofilter_affine()
 * portable version is in core/SkBitmapProcState_matrix.h
 */
void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
                                        uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    SkPoint srcPt;
    s.fInvProc(s.fInvMatrix,
               SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);

    SkFixed fx = SkScalarToFixed(srcPt.fX);
    SkFixed fy = SkScalarToFixed(srcPt.fY);
    SkFixed dx = s.fInvSx;
    SkFixed dy = s.fInvKy;
    int maxX = s.fBitmap->width() - 1;
    int maxY = s.fBitmap->height() - 1;

    if (count >= 4 && (maxX <= 0xFFFF)) {
        while (((size_t)xy & 0x0F) != 0) {
            *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
                    SkClampMax(fx >> 16, maxX);
            fx += dx;
            fy += dy;
            count--;
        }

        SkFixed dx4 = dx * 4;
        SkFixed dy4 = dy * 4;

        __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
                                        fx + dx, fx);
        __m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
                                        fy + dy, fy);
        __m128i wide_dx4 = _mm_set1_epi32(dx4);
        __m128i wide_dy4 = _mm_set1_epi32(dy4);

        __m128i wide_maxX = _mm_set1_epi32(maxX);
        __m128i wide_maxY = _mm_set1_epi32(maxY);

        while (count >= 4) {
            // SkClampMax(fx>>16,maxX)
            __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
                                            _mm_setzero_si128());
            wide_lo = _mm_min_epi16(wide_lo, wide_maxX);

            // SkClampMax(fy>>16,maxY)
            __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
                                            _mm_setzero_si128());
            wide_hi = _mm_min_epi16(wide_hi, wide_maxY);

            // final combination
            __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
                                          wide_lo);
            _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);

            wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
            wide_fy = _mm_add_epi32(wide_fy, wide_dy4);

            fx += dx4;
            fy += dy4;
            xy += 4;
            count -= 4;
        } // while count >= 4
    } // if count >= 4

    while (count-- > 0) {
        *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
                SkClampMax(fx >> 16, maxX);
        fx += dx;
        fy += dy;
    }
}

/* SSE version of S32_D16_filter_DX
 * The definition is in the "D16 functions for SRC == 8888" section of
 * SkBitmapProcState.cpp.
 * It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16.
 */
void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
                            const uint32_t* xy,
                            int count, uint16_t* colors) {
    SkASSERT(count > 0 && colors != NULL);
    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    SkASSERT(s.fBitmap->isOpaque());

    SkPMColor dstColor;
    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
    size_t rb = s.fBitmap->rowBytes();
    uint32_t XY = *xy++;
    unsigned y0 = XY >> 14;
    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    unsigned subY = y0 & 0xF;

    // ( 0, 0, 0, 0, 0, 0, 0, 16)
    __m128i sixteen = _mm_cvtsi32_si128(16);

    // ( 0, 0, 0, 0, 16, 16, 16, 16)
    sixteen = _mm_shufflelo_epi16(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, y)
    __m128i allY = _mm_cvtsi32_si128(subY);

    // ( 0, 0, 0, 0, y, y, y, y)
    allY = _mm_shufflelo_epi16(allY, 0);

    // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
    __m128i negY = _mm_sub_epi16(sixteen, allY);

    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    allY = _mm_unpacklo_epi64(allY, negY);

    // (16, 16, 16, 16, 16, 16, 16, 16)
    sixteen = _mm_shuffle_epi32(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, 0)
    __m128i zero = _mm_setzero_si128();

    do {
        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
        unsigned x0 = XX >> 18;
        unsigned x1 = XX & 0x3FFF;

        // (0, 0, 0, 0, 0, 0, 0, x)
        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

        // (0, 0, 0, 0, x, x, x, x)
        allX = _mm_shufflelo_epi16(allX, 0);

        // (x, x, x, x, x, x, x, x)
        allX = _mm_shuffle_epi32(allX, 0);

        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
        __m128i negX = _mm_sub_epi16(sixteen, allX);

        // Load 4 samples (pixels).
        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);

        // (0, 0, a00, a10)
        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

        // Expand to 16 bits per component.
        a00a10 = _mm_unpacklo_epi8(a00a10, zero);

        // ((a00 * (16-y)), (a10 * y)).
        a00a10 = _mm_mullo_epi16(a00a10, allY);

        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
        a00a10 = _mm_mullo_epi16(a00a10, negX);

        // (0, 0, a01, a11)
        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

        // Expand to 16 bits per component.
        a01a11 = _mm_unpacklo_epi8(a01a11, zero);

        // (a01 * (16-y)), (a11 * y)
        a01a11 = _mm_mullo_epi16(a01a11, allY);

        // (a01 * (16-y) * x), (a11 * y * x)
        a01a11 = _mm_mullo_epi16(a01a11, allX);

        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
        __m128i sum = _mm_add_epi16(a00a10, a01a11);

        // (DC, a00*w00 + a01*w01)
        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
        sum = _mm_add_epi16(sum, shifted);

        // Divide each 16 bit component by 256.
        sum = _mm_srli_epi16(sum, 8);

        // Pack lower 4 16 bit values of sum into lower 4 bytes.
        sum = _mm_packus_epi16(sum, zero);

        // Extract low int.
        dstColor = _mm_cvtsi128_si32(sum);

        // *colors++ = SkPixel32ToPixel16(dstColor);
        // The bit twiddling below is much faster than SkPixel32ToPixel16();
        // this was verified with the Android Softweg benchmark.
        __m128i _m_temp1 = _mm_set1_epi32(dstColor);
        __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);

        unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
        unsigned r = (r32 & ((1 << 5) - 1)) << 11;

        _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
        unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
        unsigned g = (g32 & ((1 << 6) - 1)) << 5;

        _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
        unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
        unsigned b = (b32 & ((1 << 5) - 1));

        *colors++ = r | g | b;

    } while (--count > 0);
}
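
/* For reference, the bit twiddling at the end of S32_D16_filter_DX_SSE2 is
 * equivalent to this scalar conversion of a 32-bit pixel c to RGB565
 * (illustrative sketch only, not part of Skia's API; following the code
 * above, the low byte of c feeds the field at bits 11..15, the next byte the
 * 6-bit field at bits 5..10, and the third byte the low 5-bit field):
 *
 *   static inline uint16_t to565(uint32_t c) {
 *       return (uint16_t)((((c >>  3) & 0x1F) << 11) |  // byte 0, top 5 bits
 *                         (((c >> 10) & 0x3F) <<  5) |  // byte 1, top 6 bits
 *                          ((c >> 19) & 0x1F));         // byte 2, top 5 bits
 *   }
 */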