Blame - src/opts/SkBitmapProcState_opts_SSE2.cpp - platform/external/skia

blob: 2484123767068aa4dc13b7ca6762661ea2240008 [file] [log] [blame]

senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	1	/*
epoger@google.com	ec3ed6a	2011-07-28 14:26:00 +0000	[diff] [blame]	2	* Copyright 2009 The Android Open Source Project
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	6	*/
				7
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	8	#include "SkBitmapProcState_opts_SSE2.h"
Florin Malita	9953737	2017-01-04 13:01:55 -0500	[diff] [blame]	9	#include "SkBitmapProcState_utils.h"
Cary Clark	a4083c9	2017-09-15 11:59:23 -0400	[diff] [blame]	10	#include "SkColorData.h"
reed@google.com	9cfc83c	2013-07-22 17:18:18 +0000	[diff] [blame]	11	#include "SkPaint.h"
Hal Canary	c640d0d	2018-06-13 09:59:02 -0400	[diff] [blame]	12	#include "SkTo.h"
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	13	#include "SkUtils.h"
				14
Hal Canary	c640d0d	2018-06-13 09:59:02 -0400	[diff] [blame]	15	#include <emmintrin.h>
				16
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	17	void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				18	const uint32_t* xy,
				19	int count, uint32_t* colors) {
halcanary	96fcdcc	2015-08-27 07:41:13 -0700	[diff] [blame]	20	SkASSERT(count > 0 && colors != nullptr);
reed	05a5647	2016-03-02 09:49:02 -0800	[diff] [blame]	21	SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	22	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.org	aa4f0c6	2009-12-01 13:36:19 +0000	[diff] [blame]	23	SkASSERT(s.fAlphaScale == 256);
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	24
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	25	const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
				26	size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	27	uint32_t XY = *xy++;
				28	unsigned y0 = XY >> 14;
				29	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				30	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				31	unsigned subY = y0 & 0xF;
				32
				33	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				34	__m128i sixteen = _mm_cvtsi32_si128(16);
				35
				36	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				37	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				38
				39	// ( 0, 0, 0, 0, 0, 0, 0, y)
				40	__m128i allY = _mm_cvtsi32_si128(subY);
				41
				42	// ( 0, 0, 0, 0, y, y, y, y)
				43	allY = _mm_shufflelo_epi16(allY, 0);
				44
				45	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				46	__m128i negY = _mm_sub_epi16(sixteen, allY);
				47
				48	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				49	allY = _mm_unpacklo_epi64(allY, negY);
				50
				51	// (16, 16, 16, 16, 16, 16, 16, 16 )
				52	sixteen = _mm_shuffle_epi32(sixteen, 0);
				53
				54	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				55	__m128i zero = _mm_setzero_si128();
				56	do {
				57	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				58	unsigned x0 = XX >> 18;
				59	unsigned x1 = XX & 0x3FFF;
				60
				61	// (0, 0, 0, 0, 0, 0, 0, x)
				62	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	63
senorblanco@chromium.org	dc7de74	2009-11-30 20:00:29 +0000	[diff] [blame]	64	// (0, 0, 0, 0, x, x, x, x)
				65	allX = _mm_shufflelo_epi16(allX, 0);
				66
				67	// (x, x, x, x, x, x, x, x)
				68	allX = _mm_shuffle_epi32(allX, 0);
				69
				70	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				71	__m128i negX = _mm_sub_epi16(sixteen, allX);
				72
				73	// Load 4 samples (pixels).
				74	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				75	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				76	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				77	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				78
				79	// (0, 0, a00, a10)
				80	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				81
				82	// Expand to 16 bits per component.
				83	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				84
				85	// ((a00 * (16-y)), (a10 * y)).
				86	a00a10 = _mm_mullo_epi16(a00a10, allY);
				87
				88	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				89	a00a10 = _mm_mullo_epi16(a00a10, negX);
				90
				91	// (0, 0, a01, a10)
				92	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				93
				94	// Expand to 16 bits per component.
				95	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				96
				97	// (a01 * (16-y)), (a11 * y)
				98	a01a11 = _mm_mullo_epi16(a01a11, allY);
				99
				100	// (a01 * (16-y) * x), (a11 * y * x)
				101	a01a11 = _mm_mullo_epi16(a01a11, allX);
				102
				103	// (a00w00 + a01w01, a10w10 + a11w11)
				104	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				105
				106	// (DC, a00w00 + a01w01)
				107	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				108
				109	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				110	sum = _mm_add_epi16(sum, shifted);
				111
				112	// Divide each 16 bit component by 256.
				113	sum = _mm_srli_epi16(sum, 8);
				114
				115	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				116	sum = _mm_packus_epi16(sum, zero);
				117
				118	// Extract low int and store.
				119	*colors++ = _mm_cvtsi128_si32(sum);
				120	} while (--count > 0);
				121	}
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	122
				123	void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
				124	const uint32_t* xy,
				125	int count, uint32_t* colors) {
halcanary	96fcdcc	2015-08-27 07:41:13 -0700	[diff] [blame]	126	SkASSERT(count > 0 && colors != nullptr);
reed	05a5647	2016-03-02 09:49:02 -0800	[diff] [blame]	127	SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	128	SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	129	SkASSERT(s.fAlphaScale < 256);
				130
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	131	const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
				132	size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	133	uint32_t XY = *xy++;
				134	unsigned y0 = XY >> 14;
				135	const uint32_t* row0 = reinterpret_cast<const uint32_t>(srcAddr + (y0 >> 4) rb);
				136	const uint32_t* row1 = reinterpret_cast<const uint32_t>(srcAddr + (XY & 0x3FFF) rb);
				137	unsigned subY = y0 & 0xF;
				138
				139	// ( 0, 0, 0, 0, 0, 0, 0, 16)
				140	__m128i sixteen = _mm_cvtsi32_si128(16);
				141
				142	// ( 0, 0, 0, 0, 16, 16, 16, 16)
				143	sixteen = _mm_shufflelo_epi16(sixteen, 0);
				144
				145	// ( 0, 0, 0, 0, 0, 0, 0, y)
				146	__m128i allY = _mm_cvtsi32_si128(subY);
				147
				148	// ( 0, 0, 0, 0, y, y, y, y)
				149	allY = _mm_shufflelo_epi16(allY, 0);
				150
				151	// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
				152	__m128i negY = _mm_sub_epi16(sixteen, allY);
				153
				154	// (16-y, 16-y, 16-y, 16-y, y, y, y, y)
				155	allY = _mm_unpacklo_epi64(allY, negY);
				156
				157	// (16, 16, 16, 16, 16, 16, 16, 16 )
				158	sixteen = _mm_shuffle_epi32(sixteen, 0);
				159
				160	// ( 0, 0, 0, 0, 0, 0, 0, 0)
				161	__m128i zero = _mm_setzero_si128();
				162
				163	// ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
				164	__m128i alpha = _mm_set1_epi16(s.fAlphaScale);
				165
				166	do {
				167	uint32_t XX = *xy++; // x0:14 \| 4 \| x1:14
				168	unsigned x0 = XX >> 18;
				169	unsigned x1 = XX & 0x3FFF;
				170
				171	// (0, 0, 0, 0, 0, 0, 0, x)
				172	__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	173
senorblanco@chromium.org	f3f0bd7	2009-12-10 22:46:31 +0000	[diff] [blame]	174	// (0, 0, 0, 0, x, x, x, x)
				175	allX = _mm_shufflelo_epi16(allX, 0);
				176
				177	// (x, x, x, x, x, x, x, x)
				178	allX = _mm_shuffle_epi32(allX, 0);
				179
				180	// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
				181	__m128i negX = _mm_sub_epi16(sixteen, allX);
				182
				183	// Load 4 samples (pixels).
				184	__m128i a00 = _mm_cvtsi32_si128(row0[x0]);
				185	__m128i a01 = _mm_cvtsi32_si128(row0[x1]);
				186	__m128i a10 = _mm_cvtsi32_si128(row1[x0]);
				187	__m128i a11 = _mm_cvtsi32_si128(row1[x1]);
				188
				189	// (0, 0, a00, a10)
				190	__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
				191
				192	// Expand to 16 bits per component.
				193	a00a10 = _mm_unpacklo_epi8(a00a10, zero);
				194
				195	// ((a00 * (16-y)), (a10 * y)).
				196	a00a10 = _mm_mullo_epi16(a00a10, allY);
				197
				198	// (a00 * (16-y) * (16-x), a10 * y * (16-x)).
				199	a00a10 = _mm_mullo_epi16(a00a10, negX);
				200
				201	// (0, 0, a01, a10)
				202	__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
				203
				204	// Expand to 16 bits per component.
				205	a01a11 = _mm_unpacklo_epi8(a01a11, zero);
				206
				207	// (a01 * (16-y)), (a11 * y)
				208	a01a11 = _mm_mullo_epi16(a01a11, allY);
				209
				210	// (a01 * (16-y) * x), (a11 * y * x)
				211	a01a11 = _mm_mullo_epi16(a01a11, allX);
				212
				213	// (a00w00 + a01w01, a10w10 + a11w11)
				214	__m128i sum = _mm_add_epi16(a00a10, a01a11);
				215
				216	// (DC, a00w00 + a01w01)
				217	__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
				218
				219	// (DC, a00w00 + a01w01 + a10w10 + a11w11)
				220	sum = _mm_add_epi16(sum, shifted);
				221
				222	// Divide each 16 bit component by 256.
				223	sum = _mm_srli_epi16(sum, 8);
				224
				225	// Multiply by alpha.
				226	sum = _mm_mullo_epi16(sum, alpha);
				227
				228	// Divide each 16 bit component by 256.
				229	sum = _mm_srli_epi16(sum, 8);
				230
				231	// Pack lower 4 16 bit values of sum into lower 4 bytes.
				232	sum = _mm_packus_epi16(sum, zero);
				233
				234	// Extract low int and store.
				235	*colors++ = _mm_cvtsi128_si32(sum);
				236	} while (--count > 0);
				237	}
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	238
Mike Reed	2eab65b	2018-04-17 12:01:10 -0400	[diff] [blame]	239	// Temporarily go into 64bit so we don't overflow during the add. Since we shift down by 16
				240	// in the end, the result should always fit back in 32bits.
				241	static inline int32_t safe_fixed_add_shift(SkFixed a, SkFixed b) {
				242	int64_t tmp = a;
				243	return SkToS32((tmp + b) >> 16);
				244	}
				245
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	246	static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
				247	SkFixed one) {
				248	unsigned i = SkClampMax(f >> 16, max);
				249	i = (i << 4) \| ((f >> 12) & 0xF);
Mike Reed	2eab65b	2018-04-17 12:01:10 -0400	[diff] [blame]	250	return (i << 14) \| SkClampMax(safe_fixed_add_shift(f, one), max);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	251	}
				252
				253	/* SSE version of ClampX_ClampY_filter_scale()
				254	* portable version is in core/SkBitmapProcState_matrix.h
				255	*/
				256	void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
				257	int count, int x, int y) {
				258	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				259	SkMatrix::kScale_Mask)) == 0);
				260	SkASSERT(s.fInvKy == 0);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	261
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	262	const unsigned maxX = s.fPixmap.width() - 1;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	263	const SkFixed one = s.fFilterOneX;
				264	const SkFixed dx = s.fInvSx;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	265
fmalita	2404f03	2016-02-03 05:44:21 -0800	[diff] [blame]	266	const SkBitmapProcStateAutoMapper mapper(s, x, y);
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	267	const SkFixed fy = mapper.fixedY();
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	268	const unsigned maxY = s.fPixmap.height() - 1;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	269	// compute our two Y values up front
				270	*xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
				271	// now initialize fx
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	272	SkFixed fx = mapper.fixedX();
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	273
				274	// test if we don't need to apply the tile proc
Florin Malita	9953737	2017-01-04 13:01:55 -0500	[diff] [blame]	275	if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	276	if (count >= 4) {
				277	// SSE version of decal_filter_scale
				278	while ((size_t(xy) & 0x0F) != 0) {
				279	SkASSERT((fx >> (16 + 14)) == 0);
				280	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				281	fx += dx;
				282	count--;
				283	}
				284
				285	__m128i wide_1 = _mm_set1_epi32(1);
				286	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				287	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				288	fx + dx, fx);
				289
				290	while (count >= 4) {
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	291	__m128i wide_out;
				292
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	293	wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
				294	wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	295	_mm_srai_epi32(wide_fx, 16), wide_1));
				296
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	297	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	298
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	299	xy += 4;
				300	fx += dx * 4;
				301	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
				302	count -= 4;
				303	} // while count >= 4
				304	} // if count >= 4
				305
				306	while (count-- > 0) {
				307	SkASSERT((fx >> (16 + 14)) == 0);
				308	*xy++ = (fx >> 12 << 14) \| ((fx >> 16) + 1);
				309	fx += dx;
				310	}
				311	} else {
				312	// SSE2 only support 16bit interger max & min, so only process the case
				313	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	314	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	315	// than max 16bit interger in the real world.
				316	if ((count >= 4) && (maxX <= 0xFFFF)) {
				317	while (((size_t)xy & 0x0F) != 0) {
				318	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				319	fx += dx;
				320	count--;
				321	}
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	322
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	323	__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				324	fx + dx, fx);
				325	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				326	__m128i wide_one = _mm_set1_epi32(one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	327	__m128i wide_maxX = _mm_set1_epi32(maxX);
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	328	__m128i wide_mask = _mm_set1_epi32(0xF);
				329
				330	while (count >= 4) {
				331	__m128i wide_i;
				332	__m128i wide_lo;
				333	__m128i wide_fx1;
				334
				335	// i = SkClampMax(f>>16,maxX)
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	336	wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	337	_mm_setzero_si128());
				338	wide_i = _mm_min_epi16(wide_i, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	339
Florin Malita	d1c550e	2016-12-19 10:55:41 -0500	[diff] [blame]	340	// i<<4 \| EXTRACT_LOW_BITS(fx)
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	341	wide_lo = _mm_srli_epi32(wide_fx, 12);
				342	wide_lo = _mm_and_si128(wide_lo, wide_mask);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	343	wide_i = _mm_slli_epi32(wide_i, 4);
				344	wide_i = _mm_or_si128(wide_i, wide_lo);
				345
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	346	// i<<14
				347	wide_i = _mm_slli_epi32(wide_i, 14);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	348
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	349	// SkClampMax(((f+one))>>16,max)
				350	wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	351	wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	352	_mm_setzero_si128());
				353	wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	354
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	355	// final combination
				356	wide_i = _mm_or_si128(wide_i, wide_fx1);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	357	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
				358
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	359	wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	360	fx += dx * 4;
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	361	xy += 4;
				362	count -= 4;
				363	} // while count >= 4
				364	} // if count >= 4
				365
Mike Reed	010ce2b	2018-05-09 13:53:59 -0400	[diff] [blame]	366	/*
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	367	while (count-- > 0) {
				368	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				369	fx += dx;
				370	}
Mike Reed	010ce2b	2018-05-09 13:53:59 -0400	[diff] [blame]	371	We'd like to write this as above, but that form allows fx to get 1-iteration too big/small
				372	when count is 0, and this can trigger a UBSAN error, even though we won't in fact use that
				373	last (undefined) value for fx.
				374
				375	Here is an alternative that should always be efficient, but seems much harder to read:
				376
				377	if (count > 0) {
				378	for (;;) {
				379	*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
				380	if (--count == 0) break;
				381	fx += dx;
				382	}
				383	}
				384
				385	For now, we'll try this variant: more compact than the if/for version, and we hope the
				386	compiler will get rid of the integer multiply.
				387	*/
				388	for (int i = 0; i < count; ++i) {
				389	xy++ = ClampX_ClampY_pack_filter(fx + idx, maxX, one);
				390	}
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	391	}
				392	}
				393
				394	/* SSE version of ClampX_ClampY_nofilter_scale()
				395	* portable version is in core/SkBitmapProcState_matrix.h
				396	*/
				397	void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
				398	uint32_t xy[], int count, int x, int y) {
				399	SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|
				400	SkMatrix::kScale_Mask)) == 0);
				401
				402	// we store y, x, x, x, x, x
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	403	const unsigned maxX = s.fPixmap.width() - 1;
fmalita	eb54307	2016-02-02 10:17:24 -0800	[diff] [blame]	404	const SkBitmapProcStateAutoMapper mapper(s, x, y);
reed	ad7ae6c	2015-06-04 14:12:25 -0700	[diff] [blame]	405	const unsigned maxY = s.fPixmap.height() - 1;
fmalita	be5cfa9	2016-02-03 10:21:33 -0800	[diff] [blame]	406	*xy++ = SkClampMax(mapper.intY(), maxY);
				407	SkFixed fx = mapper.fixedX();
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	408
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	409	if (0 == maxX) {
				410	// all of the following X values must be 0
				411	memset(xy, 0, count * sizeof(uint16_t));
				412	return;
				413	}
				414
				415	const SkFixed dx = s.fInvSx;
				416
				417	// test if we don't need to apply the tile proc
				418	if ((unsigned)(fx >> 16) <= maxX &&
				419	(unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
				420	// SSE version of decal_nofilter_scale
				421	if (count >= 8) {
				422	while (((size_t)xy & 0x0F) != 0) {
				423	*xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
				424	fx += 2 * dx;
				425	count -= 2;
				426	}
				427
				428	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				429	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				430
				431	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				432	fx + dx, fx);
				433	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				434
				435	while (count >= 8) {
				436	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				437	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				438
				439	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				440	wide_out_high);
				441	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	442
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	443	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				444	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				445
				446	xy += 4;
				447	fx += dx * 8;
				448	count -= 8;
				449	}
				450	} // if count >= 8
				451
				452	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				453	while (count-- > 0) {
				454	*xx++ = SkToU16(fx >> 16);
				455	fx += dx;
				456	}
				457	} else {
				458	// SSE2 only support 16bit interger max & min, so only process the case
				459	// maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	460	// height, there should be rare bitmap whose height will be greater
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	461	// than max 16bit interger in the real world.
				462	if ((count >= 8) && (maxX <= 0xFFFF)) {
				463	while (((size_t)xy & 0x0F) != 0) {
mike@reedtribe.org	602f227	2012-03-14 02:04:40 +0000	[diff] [blame]	464	*xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
				465	SkClampMax(fx >> 16, maxX));
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	466	fx += 2 * dx;
				467	count -= 2;
				468	}
				469
				470	__m128i wide_dx4 = _mm_set1_epi32(dx * 4);
				471	__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
				472
				473	__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
				474	fx + dx, fx);
				475	__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
				476	__m128i wide_maxX = _mm_set1_epi32(maxX);
				477
				478	while (count >= 8) {
				479	__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
				480	__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
				481
rmistry@google.com	fbfcd56	2012-08-23 18:09:54 +0000	[diff] [blame]	482	wide_out_low = _mm_max_epi16(wide_out_low,
tomhudson@google.com	06a7313	2012-02-22 18:30:43 +0000	[diff] [blame]	483	_mm_setzero_si128());
				484	wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
				485	wide_out_high = _mm_max_epi16(wide_out_high,
				486	_mm_setzero_si128());
				487	wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
				488
				489	__m128i wide_result = _mm_packs_epi32(wide_out_low,
				490	wide_out_high);
				491	_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
				492
				493	wide_low = _mm_add_epi32(wide_low, wide_dx8);
				494	wide_high = _mm_add_epi32(wide_high, wide_dx8);
				495
				496	xy += 4;
				497	fx += dx * 8;
				498	count -= 8;
				499	}
				500	} // if count >= 8
				501
				502	uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
				503	while (count-- > 0) {
				504	*xx++ = SkClampMax(fx >> 16, maxX);
				505	fx += dx;
				506	}
				507	}
				508	}