Blame - src/opts/SkBitmapProcState_opts_SSE2.cpp - platform/external/skqp

blob: 1f3bbc1f8f7161ff175cc8ba4c5d2ee0c7a3ae02 [file] [log] [blame]

/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
				7
				8	#include <emmintrin.h>
				9	#include "SkBitmapProcState_opts_SSE2.h"
commit-bot@chromium.org	4b9b456	2014-04-28 15:07:50 +0000	[diff] [blame]	10	#include "SkColorPriv.h"
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	11	#include "SkPaint.h"
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	12	#include "SkUtils.h"
				13
				14	void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				15	const uint32_t* xy,
				16	int count, uint32_t* colors) {
				17	SkASSERT(count > 0 && colors != NULL);
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	18	SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
reed	c77392e	2014-06-02 13:07:26 -0700	[diff] [blame]	19	SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
senorblanco@chromium.org	aa4f0c6	2009-12-01 13:36:19 +0000	[diff] [blame]	20	SkASSERT(s.fAlphaScale == 256);
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	21
				22	const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
scroggo@google.com	e5f4824	2013-02-25 21:47:41 +0000	[diff] [blame]	23	size_t rb = s.fBitmap->rowBytes();
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	24	uint32_t XY = *xy++;
				25	unsigned y0 = XY >> 14;
				26	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				27	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				28	unsigned subY = y0 & 0xF;
				29
				30	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				31	__m128i sixteen = _mm_cvtsi32_si128(16);
				32
				33	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				34	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				35
				36	// ( 0, 0, 0, 0, 0, 0, 0, y)
				37	__m128i allY = _mm_cvtsi32_si128(subY);
				38
				39	// ( 0, 0, 0, 0, y, y, y, y)
				40	allY = _mm_shufflelo_epi16(allY, 0);
				41
				42	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				43	__m128i negY = _mm_sub_epi16(sixteen, allY);
				44
				45	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				46	allY = _mm_unpacklo_epi64(allY, negY);
				47
				48	// (16, 16, 16, 16, 16, 16, 16, 16 )
				49	sixteen = _mm_shuffle_epi32(sixteen, 0);
				50
				51	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				52	__m128i zero = _mm_setzero_si128();
				53	do {
				54	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				55	unsigned x0 = XX >> 18;
				56	unsigned x1 = XX & 0x3FFF;
				57
				58	// (0, 0, 0, 0, 0, 0, 0, x)
				59	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	60
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	61	// (0, 0, 0, 0, x, x, x, x)
				62	allX = _mm_shufflelo_epi16(allX, 0);
				63
				64	// (x, x, x, x, x, x, x, x)
				65	allX = _mm_shuffle_epi32(allX, 0);
				66
				67	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				68	__m128i negX = _mm_sub_epi16(sixteen, allX);
				69
				70	// Load 4 samples (pixels).
				71	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				72	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				73	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				74	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				75
				76	// (0, 0, a00, a10)
				77	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				78
				79	// Expand to 16 bits per component.
				80	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				81
				82	// ((a00 * (16-y)), (a10 * y)).
				83	a00a10 = _mm_mullo_epi16(a00a10, allY);
				84
				85	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				86	a00a10 = _mm_mullo_epi16(a00a10, negX);
				87
				88	// (0, 0, a01, a10)
				89	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				90
				91	// Expand to 16 bits per component.
				92	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				93
				94	// (a01 * (16-y)), (a11 * y)
				95	a01a11 = _mm_mullo_epi16(a01a11, allY);
				96
				97	// (a01 * (16-y) * x), (a11 * y * x)
				98	a01a11 = _mm_mullo_epi16(a01a11, allX);
				99
				100	// (a00w00 + a01w01, a10w10 + a11w11)
				101	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				102
				103	// (DC, a00w00 + a01w01)
				104	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				105
				106	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				107	sum = _mm_add_epi16(sum, shifted);
				108
				109	// Divide each 16 bit component by 256.
				110	sum = _mm_srli_epi16(sum, 8);
				111
				112	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				113	sum = _mm_packus_epi16(sum, zero);
				114
				115	// Extract low int and store.
				116	*colors++ = _mm_cvtsi128_si32(sum);
				117	} while (--count > 0);
				118	}
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	119
				120	void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				121	const uint32_t* xy,
				122	int count, uint32_t* colors) {
				123	SkASSERT(count > 0 && colors != NULL);
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	124	SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
reed	c77392e	2014-06-02 13:07:26 -0700	[diff] [blame]	125	SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	126	SkASSERT(s.fAlphaScale < 256);
				127
				128	const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
scroggo@google.com	e5f4824	2013-02-25 21:47:41 +0000	[diff] [blame]	129	size_t rb = s.fBitmap->rowBytes();
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	130	uint32_t XY = *xy++;
				131	unsigned y0 = XY >> 14;
				132	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				133	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				134	unsigned subY = y0 & 0xF;
				135
				136	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				137	__m128i sixteen = _mm_cvtsi32_si128(16);
				138
				139	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				140	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				141
				142	// ( 0, 0, 0, 0, 0, 0, 0, y)
				143	__m128i allY = _mm_cvtsi32_si128(subY);
				144
				145	// ( 0, 0, 0, 0, y, y, y, y)
				146	allY = _mm_shufflelo_epi16(allY, 0);
				147
				148	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				149	__m128i negY = _mm_sub_epi16(sixteen, allY);
				150
				151	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				152	allY = _mm_unpacklo_epi64(allY, negY);
				153
				154	// (16, 16, 16, 16, 16, 16, 16, 16 )
				155	sixteen = _mm_shuffle_epi32(sixteen, 0);
				156
				157	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				158	__m128i zero = _mm_setzero_si128();
				159
				160	// ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
				161	__m128i alpha = _mm_set1_epi16(s.fAlphaScale);
				162
				163	do {
				164	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				165	unsigned x0 = XX >> 18;
				166	unsigned x1 = XX & 0x3FFF;
				167
				168	// (0, 0, 0, 0, 0, 0, 0, x)
				169	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	170
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	171	// (0, 0, 0, 0, x, x, x, x)
				172	allX = _mm_shufflelo_epi16(allX, 0);
				173
				174	// (x, x, x, x, x, x, x, x)
				175	allX = _mm_shuffle_epi32(allX, 0);
				176
				177	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				178	__m128i negX = _mm_sub_epi16(sixteen, allX);
				179
				180	// Load 4 samples (pixels).
				181	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				182	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				183	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				184	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				185
				186	// (0, 0, a00, a10)
				187	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				188
				189	// Expand to 16 bits per component.
				190	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				191
				192	// ((a00 * (16-y)), (a10 * y)).
				193	a00a10 = _mm_mullo_epi16(a00a10, allY);
				194
				195	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				196	a00a10 = _mm_mullo_epi16(a00a10, negX);
				197
				198	// (0, 0, a01, a10)
				199	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				200
				201	// Expand to 16 bits per component.
				202	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				203
				204	// (a01 * (16-y)), (a11 * y)
				205	a01a11 = _mm_mullo_epi16(a01a11, allY);
				206
				207	// (a01 * (16-y) * x), (a11 * y * x)
				208	a01a11 = _mm_mullo_epi16(a01a11, allX);
				209
				210	// (a00w00 + a01w01, a10w10 + a11w11)
				211	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				212
				213	// (DC, a00w00 + a01w01)
				214	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				215
				216	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				217	sum = _mm_add_epi16(sum, shifted);
				218
				219	// Divide each 16 bit component by 256.
				220	sum = _mm_srli_epi16(sum, 8);
				221
				222	// Multiply by alpha.
				223	sum = _mm_mullo_epi16(sum, alpha);
				224
				225	// Divide each 16 bit component by 256.
				226	sum = _mm_srli_epi16(sum, 8);
				227
				228	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				229	sum = _mm_packus_epi16(sum, zero);
				230
				231	// Extract low int and store.
				232	*colors++ = _mm_cvtsi128_si32(sum);
				233	} while (--count > 0);
				234	}
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	235
				236	static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
				237	SkFixed one) {
				238	unsigned i = SkClampMax(f >> 16, max);
				239	i = (i << 4) \| ((f >> 12) & 0xF);
				240	return (i << 14) \| SkClampMax((f + one) >> 16, max);
				241	}
				242
				243	/* SSE version of ClampX_ClampY_filter_scale()
				244	* portable version is in core/SkBitmapProcState_matrix.h
				245	*/
				246	void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
				247	int count, int x, int y) {
				248	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				249	SkMatrix::kScale_Mask)) == 0);
				250	SkASSERT(s.fInvKy == 0);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	251
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	252	const unsigned maxX = s.fBitmap->width() - 1;
				253	const SkFixed one = s.fFilterOneX;
				254	const SkFixed dx = s.fInvSx;
				255	SkFixed fx;
				256
				257	SkPoint pt;
humper@google.com	9c96d4b	2013-07-14 01:44:59 +0000	[diff] [blame]	258	s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
				259	SkIntToScalar(y) + SK_ScalarHalf, &pt);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	260	const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
				261	const unsigned maxY = s.fBitmap->height() - 1;
				262	// compute our two Y values up front
				263	*xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
				264	// now initialize fx
				265	fx = SkScalarToFixed(pt.fX) - (one >> 1);
				266
				267	// test if we don't need to apply the tile proc
				268	if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
				269	(unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
				270	if (count >= 4) {
				271	// SSE version of decal_filter_scale
				272	while ((size_t(xy) & 0x0F) != 0) {
				273	SkASSERT((fx >> (16 + 14)) == 0);
				274	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				275	fx += dx;
				276	count--;
				277	}
				278
				279	__m128i wide_1 = _mm_set1_epi32(1);
				280	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				281	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				282	fx + dx, fx);
				283
				284	while (count >= 4) {
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	285	__m128i wide_out;
				286
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	287	wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
				288	wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	289	_mm_srai_epi32(wide_fx, 16), wide_1));
				290
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	291	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	292
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	293	xy += 4;
				294	fx += dx * 4;
				295	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
				296	count -= 4;
				297	} // while count >= 4
				298	} // if count >= 4
				299
				300	while (count-- > 0) {
				301	SkASSERT((fx >> (16 + 14)) == 0);
				302	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				303	fx += dx;
				304	}
				305	} else {
				306	// SSE2 only support 16bit interger max & min, so only process the case
				307	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	308	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	309	// than max 16bit interger in the real world.
				310	if ((count >= 4) && (maxX <= 0xFFFF)) {
				311	while (((size_t)xy & 0x0F) != 0) {
				312	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				313	fx += dx;
				314	count--;
				315	}
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	316
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	317	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				318	fx + dx, fx);
				319	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				320	__m128i wide_one = _mm_set1_epi32(one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	321	__m128i wide_maxX = _mm_set1_epi32(maxX);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	322	__m128i wide_mask = _mm_set1_epi32(0xF);
				323
				324	while (count >= 4) {
				325	__m128i wide_i;
				326	__m128i wide_lo;
				327	__m128i wide_fx1;
				328
				329	// i = SkClampMax(f>>16,maxX)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	330	wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	331	_mm_setzero_si128());
				332	wide_i = _mm_min_epi16(wide_i, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	333
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	334	// i<<4 \| TILEX_LOW_BITS(fx)
				335	wide_lo = _mm_srli_epi32(wide_fx, 12);
				336	wide_lo = _mm_and_si128(wide_lo, wide_mask);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	337	wide_i = _mm_slli_epi32(wide_i, 4);
				338	wide_i = _mm_or_si128(wide_i, wide_lo);
				339
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	340	// i<<14
				341	wide_i = _mm_slli_epi32(wide_i, 14);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	342
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	343	// SkClampMax(((f+one))>>16,max)
				344	wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	345	wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	346	_mm_setzero_si128());
				347	wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	348
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	349	// final combination
				350	wide_i = _mm_or_si128(wide_i, wide_fx1);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	351	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
				352
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	353	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	354	fx += dx * 4;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	355	xy += 4;
				356	count -= 4;
				357	} // while count >= 4
				358	} // if count >= 4
				359
				360	while (count-- > 0) {
				361	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				362	fx += dx;
				363	}
				364	}
				365	}
				366
				367	/* SSE version of ClampX_ClampY_nofilter_scale()
				368	* portable version is in core/SkBitmapProcState_matrix.h
				369	*/
				370	void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
				371	uint32_t xy[], int count, int x, int y) {
				372	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				373	SkMatrix::kScale_Mask)) == 0);
				374
				375	// we store y, x, x, x, x, x
				376	const unsigned maxX = s.fBitmap->width() - 1;
				377	SkFixed fx;
				378	SkPoint pt;
humper@google.com	9c96d4b	2013-07-14 01:44:59 +0000	[diff] [blame]	379	s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
				380	SkIntToScalar(y) + SK_ScalarHalf, &pt);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	381	fx = SkScalarToFixed(pt.fY);
				382	const unsigned maxY = s.fBitmap->height() - 1;
				383	*xy++ = SkClampMax(fx >> 16, maxY);
				384	fx = SkScalarToFixed(pt.fX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	385
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	386	if (0 == maxX) {
				387	// all of the following X values must be 0
				388	memset(xy, 0, count * sizeof(uint16_t));
				389	return;
				390	}
				391
				392	const SkFixed dx = s.fInvSx;
				393
				394	// test if we don't need to apply the tile proc
				395	if ((unsigned)(fx >> 16) <= maxX &&
				396	(unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
				397	// SSE version of decal_nofilter_scale
				398	if (count >= 8) {
				399	while (((size_t)xy & 0x0F) != 0) {
				400	*xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
				401	fx += 2 * dx;
				402	count -= 2;
				403	}
				404
				405	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				406	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				407
				408	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				409	fx + dx, fx);
				410	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				411
				412	while (count >= 8) {
				413	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				414	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				415
				416	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				417	wide_out_high);
				418	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	419
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	420	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				421	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				422
				423	xy += 4;
				424	fx += dx * 8;
				425	count -= 8;
				426	}
				427	} // if count >= 8
				428
				429	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				430	while (count-- > 0) {
				431	*xx++ = SkToU16(fx >> 16);
				432	fx += dx;
				433	}
				434	} else {
				435	// SSE2 only support 16bit interger max & min, so only process the case
				436	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	437	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	438	// than max 16bit interger in the real world.
				439	if ((count >= 8) && (maxX <= 0xFFFF)) {
				440	while (((size_t)xy & 0x0F) != 0) {
mike@reedtribe.org	602f227	2012-03-14 02:04:40 +0000	[diff] [blame]	441	*xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
				442	SkClampMax(fx >> 16, maxX));
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	443	fx += 2 * dx;
				444	count -= 2;
				445	}
				446
				447	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				448	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				449
				450	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				451	fx + dx, fx);
				452	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				453	__m128i wide_maxX = _mm_set1_epi32(maxX);
				454
				455	while (count >= 8) {
				456	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				457	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				458
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	459	wide_out_low = _mm_max_epi16(wide_out_low,
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	460	_mm_setzero_si128());
				461	wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
				462	wide_out_high = _mm_max_epi16(wide_out_high,
				463	_mm_setzero_si128());
				464	wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
				465
				466	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				467	wide_out_high);
				468	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
				469
				470	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				471	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				472
				473	xy += 4;
				474	fx += dx * 8;
				475	count -= 8;
				476	}
				477	} // if count >= 8
				478
				479	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				480	while (count-- > 0) {
				481	*xx++ = SkClampMax(fx >> 16, maxX);
				482	fx += dx;
				483	}
				484	}
				485	}
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	486
				487	/* SSE version of ClampX_ClampY_filter_affine()
				488	* portable version is in core/SkBitmapProcState_matrix.h
				489	*/
				490	void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
				491	uint32_t xy[], int count, int x, int y) {
				492	SkPoint srcPt;
humper@google.com	9c96d4b	2013-07-14 01:44:59 +0000	[diff] [blame]	493	s.fInvProc(s.fInvMatrix,
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	494	SkIntToScalar(x) + SK_ScalarHalf,
				495	SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	496
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	497	SkFixed oneX = s.fFilterOneX;
				498	SkFixed oneY = s.fFilterOneY;
				499	SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
				500	SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
				501	SkFixed dx = s.fInvSx;
				502	SkFixed dy = s.fInvKy;
				503	unsigned maxX = s.fBitmap->width() - 1;
				504	unsigned maxY = s.fBitmap->height() - 1;
				505
				506	if (count >= 2 && (maxX <= 0xFFFF)) {
				507	SkFixed dx2 = dx + dx;
				508	SkFixed dy2 = dy + dy;
				509
				510	__m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
				511	__m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2);
				512	__m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	513	__m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	514	__m128i wide_mask = _mm_set1_epi32(0xF);
				515
				516	while (count >= 2) {
				517	// i = SkClampMax(f>>16,maxX)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	518	__m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	519	_mm_setzero_si128());
				520	wide_i = _mm_min_epi16(wide_i, wide_max);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	521
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	522	// i<<4 \| TILEX_LOW_BITS(f)
				523	__m128i wide_lo = _mm_srli_epi32(wide_f, 12);
				524	wide_lo = _mm_and_si128(wide_lo, wide_mask);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	525	wide_i = _mm_slli_epi32(wide_i, 4);
				526	wide_i = _mm_or_si128(wide_i, wide_lo);
				527
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	528	// i<<14
				529	wide_i = _mm_slli_epi32(wide_i, 14);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	530
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	531	// SkClampMax(((f+one))>>16,max)
				532	__m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	533	wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	534	_mm_setzero_si128());
				535	wide_f1 = _mm_min_epi16(wide_f1, wide_max);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	536
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	537	// final combination
				538	wide_i = _mm_or_si128(wide_i, wide_f1);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	539	_mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
				540
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	541	wide_f = _mm_add_epi32(wide_f, wide_d2);
				542
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	543	fx += dx2;
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	544	fy += dy2;
				545	xy += 4;
				546	count -= 2;
				547	} // while count >= 2
				548	} // if count >= 2
				549
				550	while (count-- > 0) {
				551	*xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
				552	fy += dy;
				553	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	554	fx += dx;
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	555	}
				556	}
				557
				558	/* SSE version of ClampX_ClampY_nofilter_affine()
				559	* portable version is in core/SkBitmapProcState_matrix.h
				560	*/
				561	void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
				562	uint32_t xy[], int count, int x, int y) {
				563	SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
				564	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				565	SkMatrix::kScale_Mask \|
				566	SkMatrix::kAffine_Mask)) == 0);
				567
				568	SkPoint srcPt;
humper@google.com	9c96d4b	2013-07-14 01:44:59 +0000	[diff] [blame]	569	s.fInvProc(s.fInvMatrix,
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	570	SkIntToScalar(x) + SK_ScalarHalf,
				571	SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	572
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	573	SkFixed fx = SkScalarToFixed(srcPt.fX);
				574	SkFixed fy = SkScalarToFixed(srcPt.fY);
				575	SkFixed dx = s.fInvSx;
				576	SkFixed dy = s.fInvKy;
				577	int maxX = s.fBitmap->width() - 1;
				578	int maxY = s.fBitmap->height() - 1;
				579
				580	if (count >= 4 && (maxX <= 0xFFFF)) {
				581	while (((size_t)xy & 0x0F) != 0) {
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	582	*xy++ = (SkClampMax(fy >> 16, maxY) << 16) \|
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	583	SkClampMax(fx >> 16, maxX);
				584	fx += dx;
				585	fy += dy;
				586	count--;
				587	}
				588
				589	SkFixed dx4 = dx * 4;
				590	SkFixed dy4 = dy * 4;
				591
				592	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				593	fx + dx, fx);
				594	__m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
				595	fy + dy, fy);
				596	__m128i wide_dx4 = _mm_set1_epi32(dx4);
				597	__m128i wide_dy4 = _mm_set1_epi32(dy4);
				598
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	599	__m128i wide_maxX = _mm_set1_epi32(maxX);
				600	__m128i wide_maxY = _mm_set1_epi32(maxY);
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	601
				602	while (count >= 4) {
				603	// SkClampMax(fx>>16,maxX)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	604	__m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	605	_mm_setzero_si128());
				606	wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	607
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	608	// SkClampMax(fy>>16,maxY)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	609	__m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	610	_mm_setzero_si128());
				611	wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	612
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	613	// final combination
				614	__m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
				615	wide_lo);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	616	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
				617
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	618	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
				619	wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
				620
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	621	fx += dx4;
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	622	fy += dy4;
				623	xy += 4;
				624	count -= 4;
				625	} // while count >= 4
				626	} // if count >= 4
				627
				628	while (count-- > 0) {
				629	*xy++ = (SkClampMax(fy >> 16, maxY) << 16) \|
				630	SkClampMax(fx >> 16, maxX);
				631	fx += dx;
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	632	fy += dy;
tomhudson@google.com	5efaf26	2012-02-28 15:41:49 +0000	[diff] [blame]	633	}
				634	}
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	635
				636	/* SSE version of S32_D16_filter_DX_SSE2
				637	* Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
				638	* It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
				639	*/
				640	void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
commit-bot@chromium.org	4b9b456	2014-04-28 15:07:50 +0000	[diff] [blame]	641	const uint32_t* xy,
				642	int count, uint16_t* colors) {
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	643	SkASSERT(count > 0 && colors != NULL);
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	644	SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
reed	c77392e	2014-06-02 13:07:26 -0700	[diff] [blame]	645	SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	646	SkASSERT(s.fBitmap->isOpaque());
				647
robertphillips@google.com	fc91dc7	2012-07-26 21:18:31 +0000	[diff] [blame]	648	SkPMColor dstColor;
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	649	const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
scroggo@google.com	e5f4824	2013-02-25 21:47:41 +0000	[diff] [blame]	650	size_t rb = s.fBitmap->rowBytes();
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	651	uint32_t XY = *xy++;
				652	unsigned y0 = XY >> 14;
				653	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				654	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				655	unsigned subY = y0 & 0xF;
				656
				657	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				658	__m128i sixteen = _mm_cvtsi32_si128(16);
				659
				660	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				661	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				662
				663	// ( 0, 0, 0, 0, 0, 0, 0, y)
				664	__m128i allY = _mm_cvtsi32_si128(subY);
				665
				666	// ( 0, 0, 0, 0, y, y, y, y)
				667	allY = _mm_shufflelo_epi16(allY, 0);
				668
				669	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				670	__m128i negY = _mm_sub_epi16(sixteen, allY);
				671
				672	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				673	allY = _mm_unpacklo_epi64(allY, negY);
				674
				675	// (16, 16, 16, 16, 16, 16, 16, 16 )
				676	sixteen = _mm_shuffle_epi32(sixteen, 0);
				677
				678	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				679	__m128i zero = _mm_setzero_si128();
				680
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	681	do {
				682	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				683	unsigned x0 = XX >> 18;
				684	unsigned x1 = XX & 0x3FFF;
				685
				686	// (0, 0, 0, 0, 0, 0, 0, x)
				687	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
				688
				689	// (0, 0, 0, 0, x, x, x, x)
				690	allX = _mm_shufflelo_epi16(allX, 0);
				691
				692	// (x, x, x, x, x, x, x, x)
				693	allX = _mm_shuffle_epi32(allX, 0);
				694
				695	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				696	__m128i negX = _mm_sub_epi16(sixteen, allX);
				697
				698	// Load 4 samples (pixels).
				699	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				700	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				701	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				702	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				703
				704	// (0, 0, a00, a10)
				705	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				706
				707	// Expand to 16 bits per component.
				708	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				709
				710	// ((a00 * (16-y)), (a10 * y)).
				711	a00a10 = _mm_mullo_epi16(a00a10, allY);
				712
				713	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				714	a00a10 = _mm_mullo_epi16(a00a10, negX);
				715
				716	// (0, 0, a01, a10)
				717	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				718
				719	// Expand to 16 bits per component.
				720	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				721
				722	// (a01 * (16-y)), (a11 * y)
				723	a01a11 = _mm_mullo_epi16(a01a11, allY);
				724
				725	// (a01 * (16-y) * x), (a11 * y * x)
				726	a01a11 = _mm_mullo_epi16(a01a11, allX);
				727
				728	// (a00w00 + a01w01, a10w10 + a11w11)
				729	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				730
				731	// (DC, a00w00 + a01w01)
				732	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				733
				734	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				735	sum = _mm_add_epi16(sum, shifted);
				736
				737	// Divide each 16 bit component by 256.
				738	sum = _mm_srli_epi16(sum, 8);
				739
				740	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				741	sum = _mm_packus_epi16(sum, zero);
				742
				743	// Extract low int and store.
				744	dstColor = _mm_cvtsi128_si32(sum);
				745
commit-bot@chromium.org	4b9b456	2014-04-28 15:07:50 +0000	[diff] [blame]	746	*colors++ = SkPixel32ToPixel16(dstColor);
reed@google.com	7866228	2012-07-24 13:53:23 +0000	[diff] [blame]	747	} while (--count > 0);
				748	}