Blame - src/opts/SkBitmapProcState_opts_SSE2.cpp - platform/external/skia

blob: d0b93503a93fa88db4d4c057a9c9ef8d6cb90e18 [file] [log] [blame]

senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	1	/*
epoger@google.com	ec3ed6a	2011-07-28 14:26:00 +0000	[diff] [blame]	2	* Copyright 2009 The Android Open Source Project
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	6	*/
				7
				8	#include <emmintrin.h>
				9	#include "SkBitmapProcState_opts_SSE2.h"
Florin Malita	9953737	2017-01-04 13:01:55 -0500	[diff] [blame]	10	#include "SkBitmapProcState_utils.h"
Cary Clark	a4083c9	2017-09-15 11:59:23 -0400	[diff] [blame^]	11	#include "SkColorData.h"
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	12	#include "SkPaint.h"
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	13	#include "SkUtils.h"
				14
				15	void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				16	const uint32_t* xy,
				17	int count, uint32_t* colors) {
halcanary	96fcdcc	2015-08-27 07:41:13 -0700	[diff] [blame]	18	SkASSERT(count > 0 && colors != nullptr);
reed	05a5647	2016-03-02 09:49:02 -0800	[diff] [blame]	19	SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	20	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.org	aa4f0c6	2009-12-01 13:36:19 +0000	[diff] [blame]	21	SkASSERT(s.fAlphaScale == 256);
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	22
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	23	const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
				24	size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	25	uint32_t XY = *xy++;
				26	unsigned y0 = XY >> 14;
				27	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				28	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				29	unsigned subY = y0 & 0xF;
				30
				31	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				32	__m128i sixteen = _mm_cvtsi32_si128(16);
				33
				34	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				35	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				36
				37	// ( 0, 0, 0, 0, 0, 0, 0, y)
				38	__m128i allY = _mm_cvtsi32_si128(subY);
				39
				40	// ( 0, 0, 0, 0, y, y, y, y)
				41	allY = _mm_shufflelo_epi16(allY, 0);
				42
				43	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				44	__m128i negY = _mm_sub_epi16(sixteen, allY);
				45
				46	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				47	allY = _mm_unpacklo_epi64(allY, negY);
				48
				49	// (16, 16, 16, 16, 16, 16, 16, 16 )
				50	sixteen = _mm_shuffle_epi32(sixteen, 0);
				51
				52	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				53	__m128i zero = _mm_setzero_si128();
				54	do {
				55	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				56	unsigned x0 = XX >> 18;
				57	unsigned x1 = XX & 0x3FFF;
				58
				59	// (0, 0, 0, 0, 0, 0, 0, x)
				60	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	61
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	62	// (0, 0, 0, 0, x, x, x, x)
				63	allX = _mm_shufflelo_epi16(allX, 0);
				64
				65	// (x, x, x, x, x, x, x, x)
				66	allX = _mm_shuffle_epi32(allX, 0);
				67
				68	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				69	__m128i negX = _mm_sub_epi16(sixteen, allX);
				70
				71	// Load 4 samples (pixels).
				72	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				73	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				74	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				75	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				76
				77	// (0, 0, a00, a10)
				78	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				79
				80	// Expand to 16 bits per component.
				81	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				82
				83	// ((a00 * (16-y)), (a10 * y)).
				84	a00a10 = _mm_mullo_epi16(a00a10, allY);
				85
				86	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				87	a00a10 = _mm_mullo_epi16(a00a10, negX);
				88
				89	// (0, 0, a01, a10)
				90	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				91
				92	// Expand to 16 bits per component.
				93	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				94
				95	// (a01 * (16-y)), (a11 * y)
				96	a01a11 = _mm_mullo_epi16(a01a11, allY);
				97
				98	// (a01 * (16-y) * x), (a11 * y * x)
				99	a01a11 = _mm_mullo_epi16(a01a11, allX);
				100
				101	// (a00w00 + a01w01, a10w10 + a11w11)
				102	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				103
				104	// (DC, a00w00 + a01w01)
				105	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				106
				107	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				108	sum = _mm_add_epi16(sum, shifted);
				109
				110	// Divide each 16 bit component by 256.
				111	sum = _mm_srli_epi16(sum, 8);
				112
				113	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				114	sum = _mm_packus_epi16(sum, zero);
				115
				116	// Extract low int and store.
				117	*colors++ = _mm_cvtsi128_si32(sum);
				118	} while (--count > 0);
				119	}
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	120
				121	void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				122	const uint32_t* xy,
				123	int count, uint32_t* colors) {
halcanary	96fcdcc	2015-08-27 07:41:13 -0700	[diff] [blame]	124	SkASSERT(count > 0 && colors != nullptr);
reed	05a5647	2016-03-02 09:49:02 -0800	[diff] [blame]	125	SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	126	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	127	SkASSERT(s.fAlphaScale < 256);
				128
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	129	const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
				130	size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	131	uint32_t XY = *xy++;
				132	unsigned y0 = XY >> 14;
				133	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				134	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				135	unsigned subY = y0 & 0xF;
				136
				137	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				138	__m128i sixteen = _mm_cvtsi32_si128(16);
				139
				140	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				141	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				142
				143	// ( 0, 0, 0, 0, 0, 0, 0, y)
				144	__m128i allY = _mm_cvtsi32_si128(subY);
				145
				146	// ( 0, 0, 0, 0, y, y, y, y)
				147	allY = _mm_shufflelo_epi16(allY, 0);
				148
				149	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				150	__m128i negY = _mm_sub_epi16(sixteen, allY);
				151
				152	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				153	allY = _mm_unpacklo_epi64(allY, negY);
				154
				155	// (16, 16, 16, 16, 16, 16, 16, 16 )
				156	sixteen = _mm_shuffle_epi32(sixteen, 0);
				157
				158	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				159	__m128i zero = _mm_setzero_si128();
				160
				161	// ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
				162	__m128i alpha = _mm_set1_epi16(s.fAlphaScale);
				163
				164	do {
				165	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				166	unsigned x0 = XX >> 18;
				167	unsigned x1 = XX & 0x3FFF;
				168
				169	// (0, 0, 0, 0, 0, 0, 0, x)
				170	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	171
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	172	// (0, 0, 0, 0, x, x, x, x)
				173	allX = _mm_shufflelo_epi16(allX, 0);
				174
				175	// (x, x, x, x, x, x, x, x)
				176	allX = _mm_shuffle_epi32(allX, 0);
				177
				178	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				179	__m128i negX = _mm_sub_epi16(sixteen, allX);
				180
				181	// Load 4 samples (pixels).
				182	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				183	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				184	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				185	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				186
				187	// (0, 0, a00, a10)
				188	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				189
				190	// Expand to 16 bits per component.
				191	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				192
				193	// ((a00 * (16-y)), (a10 * y)).
				194	a00a10 = _mm_mullo_epi16(a00a10, allY);
				195
				196	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				197	a00a10 = _mm_mullo_epi16(a00a10, negX);
				198
				199	// (0, 0, a01, a10)
				200	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				201
				202	// Expand to 16 bits per component.
				203	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				204
				205	// (a01 * (16-y)), (a11 * y)
				206	a01a11 = _mm_mullo_epi16(a01a11, allY);
				207
				208	// (a01 * (16-y) * x), (a11 * y * x)
				209	a01a11 = _mm_mullo_epi16(a01a11, allX);
				210
				211	// (a00w00 + a01w01, a10w10 + a11w11)
				212	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				213
				214	// (DC, a00w00 + a01w01)
				215	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				216
				217	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				218	sum = _mm_add_epi16(sum, shifted);
				219
				220	// Divide each 16 bit component by 256.
				221	sum = _mm_srli_epi16(sum, 8);
				222
				223	// Multiply by alpha.
				224	sum = _mm_mullo_epi16(sum, alpha);
				225
				226	// Divide each 16 bit component by 256.
				227	sum = _mm_srli_epi16(sum, 8);
				228
				229	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				230	sum = _mm_packus_epi16(sum, zero);
				231
				232	// Extract low int and store.
				233	*colors++ = _mm_cvtsi128_si32(sum);
				234	} while (--count > 0);
				235	}
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	236
				237	static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
				238	SkFixed one) {
				239	unsigned i = SkClampMax(f >> 16, max);
				240	i = (i << 4) \| ((f >> 12) & 0xF);
				241	return (i << 14) \| SkClampMax((f + one) >> 16, max);
				242	}
				243
				244	/* SSE version of ClampX_ClampY_filter_scale()
				245	* portable version is in core/SkBitmapProcState_matrix.h
				246	*/
				247	void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
				248	int count, int x, int y) {
				249	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				250	SkMatrix::kScale_Mask)) == 0);
				251	SkASSERT(s.fInvKy == 0);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	252
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	253	const unsigned maxX = s.fPixmap.width() - 1;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	254	const SkFixed one = s.fFilterOneX;
				255	const SkFixed dx = s.fInvSx;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	256
fmalita	2404f03	2016-02-03 05:44:21 -0800	[diff] [blame]	257	const SkBitmapProcStateAutoMapper mapper(s, x, y);
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	258	const SkFixed fy = mapper.fixedY();
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	259	const unsigned maxY = s.fPixmap.height() - 1;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	260	// compute our two Y values up front
				261	*xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
				262	// now initialize fx
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	263	SkFixed fx = mapper.fixedX();
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	264
				265	// test if we don't need to apply the tile proc
Florin Malita	9953737	2017-01-04 13:01:55 -0500	[diff] [blame]	266	if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	267	if (count >= 4) {
				268	// SSE version of decal_filter_scale
				269	while ((size_t(xy) & 0x0F) != 0) {
				270	SkASSERT((fx >> (16 + 14)) == 0);
				271	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				272	fx += dx;
				273	count--;
				274	}
				275
				276	__m128i wide_1 = _mm_set1_epi32(1);
				277	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				278	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				279	fx + dx, fx);
				280
				281	while (count >= 4) {
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	282	__m128i wide_out;
				283
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	284	wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
				285	wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	286	_mm_srai_epi32(wide_fx, 16), wide_1));
				287
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	288	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	289
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	290	xy += 4;
				291	fx += dx * 4;
				292	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
				293	count -= 4;
				294	} // while count >= 4
				295	} // if count >= 4
				296
				297	while (count-- > 0) {
				298	SkASSERT((fx >> (16 + 14)) == 0);
				299	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				300	fx += dx;
				301	}
				302	} else {
				303	// SSE2 only support 16bit interger max & min, so only process the case
				304	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	305	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	306	// than max 16bit interger in the real world.
				307	if ((count >= 4) && (maxX <= 0xFFFF)) {
				308	while (((size_t)xy & 0x0F) != 0) {
				309	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				310	fx += dx;
				311	count--;
				312	}
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	313
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	314	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				315	fx + dx, fx);
				316	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				317	__m128i wide_one = _mm_set1_epi32(one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	318	__m128i wide_maxX = _mm_set1_epi32(maxX);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	319	__m128i wide_mask = _mm_set1_epi32(0xF);
				320
				321	while (count >= 4) {
				322	__m128i wide_i;
				323	__m128i wide_lo;
				324	__m128i wide_fx1;
				325
				326	// i = SkClampMax(f>>16,maxX)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	327	wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	328	_mm_setzero_si128());
				329	wide_i = _mm_min_epi16(wide_i, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	330
Florin Malita	d1c550e	2016-12-19 10:55:41 -0500	[diff] [blame]	331	// i<<4 \| EXTRACT_LOW_BITS(fx)
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	332	wide_lo = _mm_srli_epi32(wide_fx, 12);
				333	wide_lo = _mm_and_si128(wide_lo, wide_mask);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	334	wide_i = _mm_slli_epi32(wide_i, 4);
				335	wide_i = _mm_or_si128(wide_i, wide_lo);
				336
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	337	// i<<14
				338	wide_i = _mm_slli_epi32(wide_i, 14);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	339
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	340	// SkClampMax(((f+one))>>16,max)
				341	wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	342	wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	343	_mm_setzero_si128());
				344	wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	345
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	346	// final combination
				347	wide_i = _mm_or_si128(wide_i, wide_fx1);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	348	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
				349
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	350	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	351	fx += dx * 4;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	352	xy += 4;
				353	count -= 4;
				354	} // while count >= 4
				355	} // if count >= 4
				356
				357	while (count-- > 0) {
				358	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				359	fx += dx;
				360	}
				361	}
				362	}
				363
				364	/* SSE version of ClampX_ClampY_nofilter_scale()
				365	* portable version is in core/SkBitmapProcState_matrix.h
				366	*/
				367	void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
				368	uint32_t xy[], int count, int x, int y) {
				369	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				370	SkMatrix::kScale_Mask)) == 0);
				371
				372	// we store y, x, x, x, x, x
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	373	const unsigned maxX = s.fPixmap.width() - 1;
fmalita	eb54307	2016-02-02 10:17:24 -0800	[diff] [blame]	374	const SkBitmapProcStateAutoMapper mapper(s, x, y);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	375	const unsigned maxY = s.fPixmap.height() - 1;
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	376	*xy++ = SkClampMax(mapper.intY(), maxY);
				377	SkFixed fx = mapper.fixedX();
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	378
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	379	if (0 == maxX) {
				380	// all of the following X values must be 0
				381	memset(xy, 0, count * sizeof(uint16_t));
				382	return;
				383	}
				384
				385	const SkFixed dx = s.fInvSx;
				386
				387	// test if we don't need to apply the tile proc
				388	if ((unsigned)(fx >> 16) <= maxX &&
				389	(unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
				390	// SSE version of decal_nofilter_scale
				391	if (count >= 8) {
				392	while (((size_t)xy & 0x0F) != 0) {
				393	*xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
				394	fx += 2 * dx;
				395	count -= 2;
				396	}
				397
				398	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				399	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				400
				401	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				402	fx + dx, fx);
				403	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				404
				405	while (count >= 8) {
				406	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				407	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				408
				409	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				410	wide_out_high);
				411	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	412
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	413	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				414	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				415
				416	xy += 4;
				417	fx += dx * 8;
				418	count -= 8;
				419	}
				420	} // if count >= 8
				421
				422	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				423	while (count-- > 0) {
				424	*xx++ = SkToU16(fx >> 16);
				425	fx += dx;
				426	}
				427	} else {
				428	// SSE2 only support 16bit interger max & min, so only process the case
				429	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	430	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	431	// than max 16bit interger in the real world.
				432	if ((count >= 8) && (maxX <= 0xFFFF)) {
				433	while (((size_t)xy & 0x0F) != 0) {
mike@reedtribe.org	602f227	2012-03-14 02:04:40 +0000	[diff] [blame]	434	*xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
				435	SkClampMax(fx >> 16, maxX));
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	436	fx += 2 * dx;
				437	count -= 2;
				438	}
				439
				440	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				441	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				442
				443	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				444	fx + dx, fx);
				445	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				446	__m128i wide_maxX = _mm_set1_epi32(maxX);
				447
				448	while (count >= 8) {
				449	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				450	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				451
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	452	wide_out_low = _mm_max_epi16(wide_out_low,
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	453	_mm_setzero_si128());
				454	wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
				455	wide_out_high = _mm_max_epi16(wide_out_high,
				456	_mm_setzero_si128());
				457	wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
				458
				459	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				460	wide_out_high);
				461	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
				462
				463	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				464	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				465
				466	xy += 4;
				467	fx += dx * 8;
				468	count -= 8;
				469	}
				470	} // if count >= 8
				471
				472	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				473	while (count-- > 0) {
				474	*xx++ = SkClampMax(fx >> 16, maxX);
				475	fx += dx;
				476	}
				477	}
				478	}