Blame - src/opts/SkBitmapProcState_opts_SSE2.cpp - platform/external/skia

blob: 9ddf269ec9e4733e1ee0a79a18df3ebe3e61996c [file] [log] [blame]

senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	1	/*
epoger@google.com	ec3ed6a	2011-07-28 14:26:00 +0000	[diff] [blame]	2	* Copyright 2009 The Android Open Source Project
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	6	*/
				7
				8	#include <emmintrin.h>
				9	#include "SkBitmapProcState_opts_SSE2.h"
Florin Malita	9953737	2017-01-04 13:01:55 -0500	[diff] [blame]	10	#include "SkBitmapProcState_utils.h"
Cary Clark	a4083c9	2017-09-15 11:59:23 -0400	[diff] [blame]	11	#include "SkColorData.h"
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	12	#include "SkPaint.h"
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	13	#include "SkUtils.h"
				14
				15	void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				16	const uint32_t* xy,
				17	int count, uint32_t* colors) {
halcanary	96fcdcc	2015-08-27 07:41:13 -0700	[diff] [blame]	18	SkASSERT(count > 0 && colors != nullptr);
reed	05a5647	2016-03-02 09:49:02 -0800	[diff] [blame]	19	SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	20	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.org	aa4f0c6	2009-12-01 13:36:19 +0000	[diff] [blame]	21	SkASSERT(s.fAlphaScale == 256);
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	22
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	23	const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
				24	size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	25	uint32_t XY = *xy++;
				26	unsigned y0 = XY >> 14;
				27	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				28	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				29	unsigned subY = y0 & 0xF;
				30
				31	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				32	__m128i sixteen = _mm_cvtsi32_si128(16);
				33
				34	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				35	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				36
				37	// ( 0, 0, 0, 0, 0, 0, 0, y)
				38	__m128i allY = _mm_cvtsi32_si128(subY);
				39
				40	// ( 0, 0, 0, 0, y, y, y, y)
				41	allY = _mm_shufflelo_epi16(allY, 0);
				42
				43	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				44	__m128i negY = _mm_sub_epi16(sixteen, allY);
				45
				46	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				47	allY = _mm_unpacklo_epi64(allY, negY);
				48
				49	// (16, 16, 16, 16, 16, 16, 16, 16 )
				50	sixteen = _mm_shuffle_epi32(sixteen, 0);
				51
				52	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				53	__m128i zero = _mm_setzero_si128();
				54	do {
				55	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				56	unsigned x0 = XX >> 18;
				57	unsigned x1 = XX & 0x3FFF;
				58
				59	// (0, 0, 0, 0, 0, 0, 0, x)
				60	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	61
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	62	// (0, 0, 0, 0, x, x, x, x)
				63	allX = _mm_shufflelo_epi16(allX, 0);
				64
				65	// (x, x, x, x, x, x, x, x)
				66	allX = _mm_shuffle_epi32(allX, 0);
				67
				68	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				69	__m128i negX = _mm_sub_epi16(sixteen, allX);
				70
				71	// Load 4 samples (pixels).
				72	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				73	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				74	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				75	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				76
				77	// (0, 0, a00, a10)
				78	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				79
				80	// Expand to 16 bits per component.
				81	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				82
				83	// ((a00 * (16-y)), (a10 * y)).
				84	a00a10 = _mm_mullo_epi16(a00a10, allY);
				85
				86	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				87	a00a10 = _mm_mullo_epi16(a00a10, negX);
				88
				89	// (0, 0, a01, a10)
				90	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				91
				92	// Expand to 16 bits per component.
				93	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				94
				95	// (a01 * (16-y)), (a11 * y)
				96	a01a11 = _mm_mullo_epi16(a01a11, allY);
				97
				98	// (a01 * (16-y) * x), (a11 * y * x)
				99	a01a11 = _mm_mullo_epi16(a01a11, allX);
				100
				101	// (a00w00 + a01w01, a10w10 + a11w11)
				102	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				103
				104	// (DC, a00w00 + a01w01)
				105	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				106
				107	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				108	sum = _mm_add_epi16(sum, shifted);
				109
				110	// Divide each 16 bit component by 256.
				111	sum = _mm_srli_epi16(sum, 8);
				112
				113	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				114	sum = _mm_packus_epi16(sum, zero);
				115
				116	// Extract low int and store.
				117	*colors++ = _mm_cvtsi128_si32(sum);
				118	} while (--count > 0);
				119	}
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	120
				121	void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				122	const uint32_t* xy,
				123	int count, uint32_t* colors) {
halcanary	96fcdcc	2015-08-27 07:41:13 -0700	[diff] [blame]	124	SkASSERT(count > 0 && colors != nullptr);
reed	05a5647	2016-03-02 09:49:02 -0800	[diff] [blame]	125	SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	126	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	127	SkASSERT(s.fAlphaScale < 256);
				128
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	129	const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
				130	size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	131	uint32_t XY = *xy++;
				132	unsigned y0 = XY >> 14;
				133	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				134	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				135	unsigned subY = y0 & 0xF;
				136
				137	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				138	__m128i sixteen = _mm_cvtsi32_si128(16);
				139
				140	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				141	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				142
				143	// ( 0, 0, 0, 0, 0, 0, 0, y)
				144	__m128i allY = _mm_cvtsi32_si128(subY);
				145
				146	// ( 0, 0, 0, 0, y, y, y, y)
				147	allY = _mm_shufflelo_epi16(allY, 0);
				148
				149	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				150	__m128i negY = _mm_sub_epi16(sixteen, allY);
				151
				152	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				153	allY = _mm_unpacklo_epi64(allY, negY);
				154
				155	// (16, 16, 16, 16, 16, 16, 16, 16 )
				156	sixteen = _mm_shuffle_epi32(sixteen, 0);
				157
				158	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				159	__m128i zero = _mm_setzero_si128();
				160
				161	// ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
				162	__m128i alpha = _mm_set1_epi16(s.fAlphaScale);
				163
				164	do {
				165	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				166	unsigned x0 = XX >> 18;
				167	unsigned x1 = XX & 0x3FFF;
				168
				169	// (0, 0, 0, 0, 0, 0, 0, x)
				170	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	171
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	172	// (0, 0, 0, 0, x, x, x, x)
				173	allX = _mm_shufflelo_epi16(allX, 0);
				174
				175	// (x, x, x, x, x, x, x, x)
				176	allX = _mm_shuffle_epi32(allX, 0);
				177
				178	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				179	__m128i negX = _mm_sub_epi16(sixteen, allX);
				180
				181	// Load 4 samples (pixels).
				182	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				183	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				184	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				185	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				186
				187	// (0, 0, a00, a10)
				188	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				189
				190	// Expand to 16 bits per component.
				191	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				192
				193	// ((a00 * (16-y)), (a10 * y)).
				194	a00a10 = _mm_mullo_epi16(a00a10, allY);
				195
				196	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				197	a00a10 = _mm_mullo_epi16(a00a10, negX);
				198
				199	// (0, 0, a01, a10)
				200	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				201
				202	// Expand to 16 bits per component.
				203	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				204
				205	// (a01 * (16-y)), (a11 * y)
				206	a01a11 = _mm_mullo_epi16(a01a11, allY);
				207
				208	// (a01 * (16-y) * x), (a11 * y * x)
				209	a01a11 = _mm_mullo_epi16(a01a11, allX);
				210
				211	// (a00w00 + a01w01, a10w10 + a11w11)
				212	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				213
				214	// (DC, a00w00 + a01w01)
				215	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				216
				217	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				218	sum = _mm_add_epi16(sum, shifted);
				219
				220	// Divide each 16 bit component by 256.
				221	sum = _mm_srli_epi16(sum, 8);
				222
				223	// Multiply by alpha.
				224	sum = _mm_mullo_epi16(sum, alpha);
				225
				226	// Divide each 16 bit component by 256.
				227	sum = _mm_srli_epi16(sum, 8);
				228
				229	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				230	sum = _mm_packus_epi16(sum, zero);
				231
				232	// Extract low int and store.
				233	*colors++ = _mm_cvtsi128_si32(sum);
				234	} while (--count > 0);
				235	}
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	236
Mike Reed	2eab65b	2018-04-17 12:01:10 -0400	[diff] [blame^]	237	// Temporarily go into 64bit so we don't overflow during the add. Since we shift down by 16
				238	// in the end, the result should always fit back in 32bits.
				239	static inline int32_t safe_fixed_add_shift(SkFixed a, SkFixed b) {
				240	int64_t tmp = a;
				241	return SkToS32((tmp + b) >> 16);
				242	}
				243
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	244	static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
				245	SkFixed one) {
				246	unsigned i = SkClampMax(f >> 16, max);
				247	i = (i << 4) \| ((f >> 12) & 0xF);
Mike Reed	2eab65b	2018-04-17 12:01:10 -0400	[diff] [blame^]	248	return (i << 14) \| SkClampMax(safe_fixed_add_shift(f, one), max);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	249	}
				250
				251	/* SSE version of ClampX_ClampY_filter_scale()
				252	* portable version is in core/SkBitmapProcState_matrix.h
				253	*/
				254	void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
				255	int count, int x, int y) {
				256	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				257	SkMatrix::kScale_Mask)) == 0);
				258	SkASSERT(s.fInvKy == 0);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	259
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	260	const unsigned maxX = s.fPixmap.width() - 1;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	261	const SkFixed one = s.fFilterOneX;
				262	const SkFixed dx = s.fInvSx;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	263
fmalita	2404f03	2016-02-03 05:44:21 -0800	[diff] [blame]	264	const SkBitmapProcStateAutoMapper mapper(s, x, y);
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	265	const SkFixed fy = mapper.fixedY();
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	266	const unsigned maxY = s.fPixmap.height() - 1;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	267	// compute our two Y values up front
				268	*xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
				269	// now initialize fx
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	270	SkFixed fx = mapper.fixedX();
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	271
				272	// test if we don't need to apply the tile proc
Florin Malita	9953737	2017-01-04 13:01:55 -0500	[diff] [blame]	273	if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	274	if (count >= 4) {
				275	// SSE version of decal_filter_scale
				276	while ((size_t(xy) & 0x0F) != 0) {
				277	SkASSERT((fx >> (16 + 14)) == 0);
				278	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				279	fx += dx;
				280	count--;
				281	}
				282
				283	__m128i wide_1 = _mm_set1_epi32(1);
				284	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				285	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				286	fx + dx, fx);
				287
				288	while (count >= 4) {
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	289	__m128i wide_out;
				290
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	291	wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
				292	wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	293	_mm_srai_epi32(wide_fx, 16), wide_1));
				294
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	295	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	296
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	297	xy += 4;
				298	fx += dx * 4;
				299	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
				300	count -= 4;
				301	} // while count >= 4
				302	} // if count >= 4
				303
				304	while (count-- > 0) {
				305	SkASSERT((fx >> (16 + 14)) == 0);
				306	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				307	fx += dx;
				308	}
				309	} else {
				310	// SSE2 only support 16bit interger max & min, so only process the case
				311	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	312	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	313	// than max 16bit interger in the real world.
				314	if ((count >= 4) && (maxX <= 0xFFFF)) {
				315	while (((size_t)xy & 0x0F) != 0) {
				316	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				317	fx += dx;
				318	count--;
				319	}
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	320
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	321	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				322	fx + dx, fx);
				323	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				324	__m128i wide_one = _mm_set1_epi32(one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	325	__m128i wide_maxX = _mm_set1_epi32(maxX);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	326	__m128i wide_mask = _mm_set1_epi32(0xF);
				327
				328	while (count >= 4) {
				329	__m128i wide_i;
				330	__m128i wide_lo;
				331	__m128i wide_fx1;
				332
				333	// i = SkClampMax(f>>16,maxX)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	334	wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	335	_mm_setzero_si128());
				336	wide_i = _mm_min_epi16(wide_i, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	337
Florin Malita	d1c550e	2016-12-19 10:55:41 -0500	[diff] [blame]	338	// i<<4 \| EXTRACT_LOW_BITS(fx)
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	339	wide_lo = _mm_srli_epi32(wide_fx, 12);
				340	wide_lo = _mm_and_si128(wide_lo, wide_mask);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	341	wide_i = _mm_slli_epi32(wide_i, 4);
				342	wide_i = _mm_or_si128(wide_i, wide_lo);
				343
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	344	// i<<14
				345	wide_i = _mm_slli_epi32(wide_i, 14);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	346
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	347	// SkClampMax(((f+one))>>16,max)
				348	wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	349	wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	350	_mm_setzero_si128());
				351	wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	352
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	353	// final combination
				354	wide_i = _mm_or_si128(wide_i, wide_fx1);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	355	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
				356
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	357	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	358	fx += dx * 4;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	359	xy += 4;
				360	count -= 4;
				361	} // while count >= 4
				362	} // if count >= 4
				363
				364	while (count-- > 0) {
				365	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				366	fx += dx;
				367	}
				368	}
				369	}
				370
				371	/* SSE version of ClampX_ClampY_nofilter_scale()
				372	* portable version is in core/SkBitmapProcState_matrix.h
				373	*/
				374	void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
				375	uint32_t xy[], int count, int x, int y) {
				376	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				377	SkMatrix::kScale_Mask)) == 0);
				378
				379	// we store y, x, x, x, x, x
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	380	const unsigned maxX = s.fPixmap.width() - 1;
fmalita	eb54307	2016-02-02 10:17:24 -0800	[diff] [blame]	381	const SkBitmapProcStateAutoMapper mapper(s, x, y);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	382	const unsigned maxY = s.fPixmap.height() - 1;
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	383	*xy++ = SkClampMax(mapper.intY(), maxY);
				384	SkFixed fx = mapper.fixedX();
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	385
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	386	if (0 == maxX) {
				387	// all of the following X values must be 0
				388	memset(xy, 0, count * sizeof(uint16_t));
				389	return;
				390	}
				391
				392	const SkFixed dx = s.fInvSx;
				393
				394	// test if we don't need to apply the tile proc
				395	if ((unsigned)(fx >> 16) <= maxX &&
				396	(unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
				397	// SSE version of decal_nofilter_scale
				398	if (count >= 8) {
				399	while (((size_t)xy & 0x0F) != 0) {
				400	*xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
				401	fx += 2 * dx;
				402	count -= 2;
				403	}
				404
				405	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				406	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				407
				408	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				409	fx + dx, fx);
				410	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				411
				412	while (count >= 8) {
				413	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				414	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				415
				416	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				417	wide_out_high);
				418	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	419
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	420	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				421	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				422
				423	xy += 4;
				424	fx += dx * 8;
				425	count -= 8;
				426	}
				427	} // if count >= 8
				428
				429	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				430	while (count-- > 0) {
				431	*xx++ = SkToU16(fx >> 16);
				432	fx += dx;
				433	}
				434	} else {
				435	// SSE2 only support 16bit interger max & min, so only process the case
				436	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	437	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	438	// than max 16bit interger in the real world.
				439	if ((count >= 8) && (maxX <= 0xFFFF)) {
				440	while (((size_t)xy & 0x0F) != 0) {
mike@reedtribe.org	602f227	2012-03-14 02:04:40 +0000	[diff] [blame]	441	*xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
				442	SkClampMax(fx >> 16, maxX));
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	443	fx += 2 * dx;
				444	count -= 2;
				445	}
				446
				447	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				448	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				449
				450	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				451	fx + dx, fx);
				452	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				453	__m128i wide_maxX = _mm_set1_epi32(maxX);
				454
				455	while (count >= 8) {
				456	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				457	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				458
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	459	wide_out_low = _mm_max_epi16(wide_out_low,
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	460	_mm_setzero_si128());
				461	wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
				462	wide_out_high = _mm_max_epi16(wide_out_high,
				463	_mm_setzero_si128());
				464	wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
				465
				466	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				467	wide_out_high);
				468	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
				469
				470	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				471	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				472
				473	xy += 4;
				474	fx += dx * 8;
				475	count -= 8;
				476	}
				477	} // if count >= 8
				478
				479	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				480	while (count-- > 0) {
				481	*xx++ = SkClampMax(fx >> 16, maxX);
				482	fx += dx;
				483	}
				484	}
				485	}