/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8#ifndef SkBitmapProcState_opts_DEFINED
9#define SkBitmapProcState_opts_DEFINED
10
11#include "SkBitmapProcState.h"
12
13// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
14//
15// Only S32_alpha_D32_filter_DX exploits instructions beyond
16// our common baseline SSE2/NEON instruction sets, so that's
17// all that lives here.
18//
19// The rest are scattershot at the moment but I want to get them
20// all migrated to be normal code inside SkBitmapProcState.cpp.
21
22#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
23 #include <immintrin.h>
24#elif defined(SK_ARM_HAS_NEON)
25 #include <arm_neon.h>
26#endif
27
28namespace SK_OPTS_NS {
29
30#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
31 // This same basic packing scheme is used throughout the file.
32 static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
33 // The top 14 bits are the integer coordinate x0 or y0.
34 *v0 = packed >> 18;
35
36 // The bottom 14 bits are the integer coordinate x1 or y1.
37 *v1 = packed & 0x3fff;
38
39 // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
40 *w = (packed >> 14) & 0xf;
41 }
42
    // As above, 4x: decode four packed [v0:14|w:4|v1:14] words at once.
    // v0[] and v1[] receive the four integer coordinates; *w holds the four
    // 4-bit weights, each still sitting in its own 32-bit lane.
    static void decode_packed_coordinates_and_weight(__m128i packed,
                                                     int v0[4], int v1[4], __m128i* w) {
        _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));                      // top 14 bits
        _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));  // bottom 14 bits
        *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));             // middle 4 bits
    }
50
    // This is the crux of the SSSE3 implementation,
    // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
    // The result's eight 16-bit lanes hold pixel A's four channels then pixel B's,
    // each scaled up by the [0,16] x-weights (i.e. not yet divided back down).
    static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                           uint32_t B0, uint32_t B1,
                                           const __m128i& interlaced_x_weights) {
        // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
        //
        // It takes two arguments interlaced byte-wise:
        //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
        //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
        // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
        //
        // That's why we go to all this trouble to make interlaced_x_weights,
        // and here we're interlacing A0 with A1, B0 with B1 to match.

        __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

        // Pack A's interlaced bytes in the low 64 bits, B's in the high 64, then
        // one madd does both horizontal lerps at once.
        return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                 interlaced_x_weights);
    }
72
    // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
    // {A0,A1} are pixel A's top (row0) neighbors and {A2,A3} its bottom (row1)
    // neighbors; likewise for B. Returns two pixels, with each channel in a
    // 16-bit lane of the __m128i (A in the low four lanes, B in the high four).
    static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
                                                 uint32_t A2, uint32_t A3,
                                                 uint32_t B0, uint32_t B1,
                                                 uint32_t B2, uint32_t B3,
                                                 const __m128i& interlaced_x_weights,
                                                 int wy) {
        // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
        const __m128i wy1 = _mm_set1_epi16(wy),
                      wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);

        // First interpolate in X,
        // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
        __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

        // Interpolate in Y across the two rows,
        // then scale everything down by the maximum total weight 16x16 = 256 (>> 8).
        return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
                                            _mm_mullo_epi16(row1, wy1)), 8);
    }
95
    // Bilinear-filter a run of N32 pixels along one scanline (fixed Y, varying X),
    // then scale each result by s.fAlphaScale. SSSE3 version: produces four output
    // pixels per loop iteration via _mm_maddubs_epi16().
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

        int alpha = s.fAlphaScale;

        // Return (px * s.fAlphaScale) / 256. (s.fAlphaScale is in [0,256].)
        auto scale_by_alpha = [alpha](const __m128i& px) {
            return alpha == 256 ? px
                                : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
        };

        // We're in _DX_ mode here, so we're only varying in X.
        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
        // All the other entries in xy will be pairs of X coordinates and the X weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

        while (count >= 4) {
            // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
            const __m128i xx = _mm_loadu_si128((const __m128i*)xy);

            int x0[4],
                x1[4];
            __m128i wx;
            decode_packed_coordinates_and_weight(xx, x0, x1, &wx);

            // Splat out each x weight wx four times (one for each pixel channel) as wx1,
            // and sixteen minus that as the weight for x0, wx0.
            __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

            // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);

            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
            __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
                                                row1[x0[0]], row1[x1[0]],
                                                row0[x0[1]], row0[x1[1]],
                                                row1[x0[1]], row1[x1[1]],
                                                interlaced_x_weights_AB, wy);

            // Once more with the other half of the x-weights for two more pixels C,D.
            __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
                                                row1[x0[2]], row1[x1[2]],
                                                row0[x0[3]], row0[x1[3]],
                                                row1[x0[3]], row1[x1[3]],
                                                interlaced_x_weights_CD, wy);

            // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
                                                                scale_by_alpha(CD)));
            xy += 4;
            colors += 4;
            count -= 4;
        }

        while (count --> 0) {
            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
            __m128i wx1 = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

            __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);

            // Interpolate pixel A only; the B slot is fed zeroes and its result ignored.
            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,
                                               0, 0,
                                               interlaced_x_weights_A, wy);

            // Scale by alpha, pack down to 8-bit, and store just the low 32-bit pixel.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
        }
    }
181
182
183#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184
185 // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
186
    // Bilinear filter + alpha scale for a fixed-Y scanline, SSE2 version: one output
    // pixel per iteration. The first *xy entry packs [y0:14 | subY:4 | y1:14]; every
    // following entry packs [x0:14 | subX:4 | x1:14]. Lane diagrams below are written
    // high-to-low across the eight 16-bit lanes.
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
        size_t rb = s.fPixmap.rowBytes();
        uint32_t XY = *xy++;
        unsigned y0 = XY >> 14;   // y0 still carries subY in its low 4 bits.
        const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
        const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
        unsigned subY = y0 & 0xF;

        // ( 0, 0, 0, 0, 0, 0, 0, 16)
        __m128i sixteen = _mm_cvtsi32_si128(16);

        // ( 0, 0, 0, 0, 16, 16, 16, 16)
        sixteen = _mm_shufflelo_epi16(sixteen, 0);

        // ( 0, 0, 0, 0, 0, 0, 0, y)
        __m128i allY = _mm_cvtsi32_si128(subY);

        // ( 0, 0, 0, 0, y, y, y, y)
        allY = _mm_shufflelo_epi16(allY, 0);

        // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
        __m128i negY = _mm_sub_epi16(sixteen, allY);

        // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
        allY = _mm_unpacklo_epi64(allY, negY);

        // (16, 16, 16, 16, 16, 16, 16, 16 )
        sixteen = _mm_shuffle_epi32(sixteen, 0);

        // ( 0, 0, 0, 0, 0, 0, 0, 0)
        __m128i zero = _mm_setzero_si128();

        // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
        __m128i alpha = _mm_set1_epi16(s.fAlphaScale);

        do {
            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
            unsigned x0 = XX >> 18;
            unsigned x1 = XX & 0x3FFF;

            // (0, 0, 0, 0, 0, 0, 0, x)
            __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

            // (0, 0, 0, 0, x, x, x, x)
            allX = _mm_shufflelo_epi16(allX, 0);

            // (x, x, x, x, x, x, x, x)
            allX = _mm_shuffle_epi32(allX, 0);

            // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
            __m128i negX = _mm_sub_epi16(sixteen, allX);

            // Load 4 samples (pixels): the 2x2 neighborhood being blended.
            __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
            __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
            __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
            __m128i a11 = _mm_cvtsi32_si128(row1[x1]);

            // (0, 0, a00, a10)
            __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

            // Expand to 16 bits per component.
            a00a10 = _mm_unpacklo_epi8(a00a10, zero);

            // ((a00 * (16-y)), (a10 * y)).
            a00a10 = _mm_mullo_epi16(a00a10, allY);

            // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
            a00a10 = _mm_mullo_epi16(a00a10, negX);

            // (0, 0, a01, a11)
            __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

            // Expand to 16 bits per component.
            a01a11 = _mm_unpacklo_epi8(a01a11, zero);

            // (a01 * (16-y)), (a11 * y)
            a01a11 = _mm_mullo_epi16(a01a11, allY);

            // (a01 * (16-y) * x), (a11 * y * x)
            a01a11 = _mm_mullo_epi16(a01a11, allX);

            // (a00*w00 + a01*w01, a10*w10 + a11*w11)
            __m128i sum = _mm_add_epi16(a00a10, a01a11);

            // Move the high 64 bits down so both halves can be summed:
            // (DC, a00*w00 + a01*w01)
            __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

            // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
            sum = _mm_add_epi16(sum, shifted);

            // Divide each 16 bit component by 256 (the max total weight 16*16).
            sum = _mm_srli_epi16(sum, 8);

            // Multiply by alpha.
            sum = _mm_mullo_epi16(sum, alpha);

            // Divide each 16 bit component by 256.
            sum = _mm_srli_epi16(sum, 8);

            // Pack lower 4 16 bit values of sum into lower 4 bytes.
            sum = _mm_packus_epi16(sum, zero);

            // Extract low int and store.
            *colors++ = _mm_cvtsi128_si32(sum);
        } while (--count > 0);
    }
302
303#else
304
305 // The NEON code only actually differs from the portable code in the
306 // filtering step after we've loaded all four pixels we want to bilerp.
307
308 #if defined(SK_ARM_HAS_NEON)
        // NEON bilinear filter for one pixel: blend the 2x2 neighborhood
        // {a00,a01;a10,a11} using 4-bit fractional weights x,y, then scale the
        // result by `scale` (in [0,256]) and store the packed pixel to *dst.
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor *dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;

            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top to a11

            // Widening multiplies do the Y lerp for both row pixels at once.
            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

            // Accumulate the X lerp across the four weighted samples.
            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp = a01 * x
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

            vscale = vdup_n_u16(scale);        // duplicate scale
            tmp = vshr_n_u16(tmp, 8);          // shift down result by 8 (max weight 16*16 = 256)
            tmp = vmul_u16(tmp, vscale);       // multiply result by scale

            // Narrowing shift packs each channel back to 8 bits.
            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
        }
347 #else
348 static void filter_and_scale_by_alpha(unsigned x, unsigned y,
349 SkPMColor a00, SkPMColor a01,
350 SkPMColor a10, SkPMColor a11,
351 SkPMColor* dstColor,
352 unsigned alphaScale) {
353 SkASSERT((unsigned)x <= 0xF);
354 SkASSERT((unsigned)y <= 0xF);
355 SkASSERT(alphaScale <= 256);
356
357 int xy = x * y;
358 const uint32_t mask = 0xFF00FF;
359
360 int scale = 256 - 16*y - 16*x + xy;
361 uint32_t lo = (a00 & mask) * scale;
362 uint32_t hi = ((a00 >> 8) & mask) * scale;
363
364 scale = 16*x - xy;
365 lo += (a01 & mask) * scale;
366 hi += ((a01 >> 8) & mask) * scale;
367
368 scale = 16*y - xy;
369 lo += (a10 & mask) * scale;
370 hi += ((a10 >> 8) & mask) * scale;
371
372 lo += (a11 & mask) * xy;
373 hi += ((a11 >> 8) & mask) * xy;
374
375 // TODO: if (alphaScale < 256) ...
376 lo = ((lo >> 8) & mask) * alphaScale;
377 hi = ((hi >> 8) & mask) * alphaScale;
378
379 *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
380 }
381 #endif
382
383
384 // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
385
386 /*not static*/ inline
387 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
388 const uint32_t* xy, int count, SkPMColor* colors) {
389 SkASSERT(count > 0 && colors != nullptr);
390 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
391 SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
392 SkASSERT(s.fAlphaScale <= 256);
393
394 unsigned alphaScale = s.fAlphaScale;
395
396 const char* srcAddr = (const char*)s.fPixmap.addr();
397 size_t rb = s.fPixmap.rowBytes();
398 unsigned subY;
399 const SkPMColor* row0;
400 const SkPMColor* row1;
401
402 // setup row ptrs and update proc_table
403 {
404 uint32_t XY = *xy++;
405 unsigned y0 = XY >> 14;
406 row0 = (const SkPMColor*)(srcAddr + (y0 >> 4) * rb);
407 row1 = (const SkPMColor*)(srcAddr + (XY & 0x3FFF) * rb);
408 subY = y0 & 0xF;
409 }
410
411 do {
412 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
413 unsigned x0 = XX >> 14;
414 unsigned x1 = XX & 0x3FFF;
415 unsigned subX = x0 & 0xF;
416 x0 >>= 4;
417
418 filter_and_scale_by_alpha(subX, subY,
419 row0[x0], row0[x1],
420 row1[x0], row1[x1],
421 colors,
422 alphaScale);
423 colors += 1;
424
425 } while (--count != 0);
426 }
427
428#endif
429
430} // namespace SK_OPTS_NS
431
432#endif