/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file
// first before trying to understand this one. We'll note only key differences here.

#include "SkSplicer_shared.h"
#include <string.h>

#if !defined(__clang__)
    #error This file is not like the rest of Skia. It must be compiled with clang.
#endif

#if defined(__aarch64__)
    #include <arm_neon.h>

    // In this file, F is a vector of SkFixed15.
    // See SkFixed15.h for notes on its various operations.
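    // (A reference note, assuming the conventions in SkFixed15.h: SkFixed15 is an unsigned
    // 16-bit fixed-point format for values in [0,1], with 1.0 represented as 0x8000 == 32768.
    // You can sanity-check that below: load_8888 maps byte 255 to 32768, and clamp_1 clamps
    // against k->_1.)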
    struct F {
        using V = uint16_t __attribute__((ext_vector_type(8)));

        V vec;

        F(uint16x8_t v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return vqaddq_u16(vec, o.vec); }
        F operator-(F o) const { return vqsubq_u16(vec, o.vec); }
        F operator*(F o) const {
            return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)),
                               vandq_s16(vec, o.vec), 15);
        }
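        // A sketch of why that multiply works (SkFixed15.h has the canonical story):
        // vqrdmulhq_s16 computes a rounded, saturating (a*b*2 + 0x8000) >> 16, i.e. a
        // rounded (a*b) >> 15, but it reads 0x8000 (our 1.0) as the int16 -32768.
        // vabsq_s16 fixes the sign, and the vsraq_n_u16 term adds back 1 exactly when
        // both inputs are 1.0 (the only time bit 15 is set in a&b): 0x8000 * 0x8000
        // saturates to 0x7fff, and (0x8000 & 0x8000) >> 15 == 1 brings us to 0x8000.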
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return vminq_u16(a,b); }
    static F max(F a, F b) { return vmaxq_u16(a,b); }

#elif defined(__ARM_NEON__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    struct F {
        using V = uint16_t __attribute__((ext_vector_type(4)));

        V vec;

        F(uint16x4_t v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return vqadd_u16(vec, o.vec); }
        F operator-(F o) const { return vqsub_u16(vec, o.vec); }
        F operator*(F o) const {
            return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)),
                              vand_s16(vec, o.vec), 15);
        }
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return vmin_u16(a,b); }
    static F max(F a, F b) { return vmax_u16(a,b); }

#else
    #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
        #error On x86, compile with -mavx2 -mfma -mf16c.
    #endif
    #include <immintrin.h>

    struct F {
        using V = uint16_t __attribute__((ext_vector_type(16)));

        V vec;
        F(__m256i v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); }
        F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); }
        F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); }
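        // (No fixup term is needed here: by Intel's definition, _mm256_mulhrs_epi16 returns
        // 0x8000 when both inputs are 0x8000 -- the one case where its rounded
        // (a*b*2 + 0x8000) >> 16 wraps -- and _mm256_abs_epi16(0x8000) stays 0x8000,
        // which is exactly our 1.0 * 1.0 == 1.0.)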
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return _mm256_min_epu16(a,b); }
    static F max(F a, F b) { return _mm256_max_epu16(a,b); }
#endif

// No platform actually supports FMA for SkFixed15.
// This fma() helper just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }

#if defined(__ARM_NEON__)
    #define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
    #define C extern "C"
#endif

// We use a set of constants suitable for SkFixed15 math.
using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);

// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
// registers. This shouldn't affect performance or how you write STAGEs in any way.
C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V);

#define STAGE(name)                                                           \
    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
    C void name##_lowp(size_t x, size_t limit, void* ctx, K* k,               \
                       F::V R, F::V G, F::V B, F::V A,                        \
                       F::V DR, F::V DG, F::V DB, F::V DA) {                  \
        F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA;     \
        name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
        done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
    }                                                                         \
    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
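
// For illustration, STAGE(clear) mechanically expands to roughly:
//
//     static void clear_k(size_t& x, size_t limit, void* ctx, K* k,
//                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);
//     C void clear_lowp(size_t x, size_t limit, void* ctx, K* k, ...) {
//         ... copy the eight F::V arguments into F locals ...
//         clear_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);
//         done   (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);
//     }
//     static void clear_k(...)   // <- the body written below attaches here
//
// So name##_lowp is the spliced entry point, and each STAGE body is really name##_k.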

STAGE(inc_x) {
    x += sizeof(F) / sizeof(uint16_t);
}
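// (sizeof(F) / sizeof(uint16_t) is the number of pixels each stripe covers:
//  8 on aarch64, 4 on ARMv7, 16 with AVX2, matching the ext_vector_type widths above.)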

STAGE(clear) {
    r = g = b = a = 0;
}

STAGE(plus_) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}

STAGE(srcover) {
    auto A = F(k->_1) - a;
    r = fma(dr, A, r);
    g = fma(dg, A, g);
    b = fma(db, A, b);
    a = fma(da, A, a);
}
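// dstover is just srcover with the roles of source and destination swapped,
// so we can reuse srcover_k with its color arguments exchanged.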
STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }

STAGE(clamp_1) {
    r = min(r, k->_1);
    g = min(g, k->_1);
    b = min(b, k->_1);
    a = min(a, k->_1);
}

STAGE(clamp_a) {
    a = min(a, k->_1);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;

#if defined(__aarch64__)
    auto to_fixed15 = [](uint8x8_t u8) {
        // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8 (see SkFixed15.h)
        //
        // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
        // and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
        auto u16 = vshll_n_u8(u8, 7);     // u16 = u8*128
        return vrsraq_n_u16(u16, u16, 8); // u16 + u16/256, with rounding
    };
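    // Spot check of the endpoints, using the math above: u8 == 255 gives
    // u16 = 255*128 = 32640, then 32640 + round(32640/256) = 32640 + 128 = 32768 (1.0);
    // u8 == 0 gives 0. So both ends land exactly on SkFixed15 0.0 and 1.0.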

    uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
    r = to_fixed15(rgba.val[0]);
    g = to_fixed15(rgba.val[1]);
    b = to_fixed15(rgba.val[2]);
    a = to_fixed15(rgba.val[3]);

#elif defined(__ARM_NEON__)
    auto to_fixed15 = [](uint8x8_t u8) {
        // Same as aarch64, but only keeping the bottom 4 lanes.
        auto u16 = vshll_n_u8(u8, 7);
        return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
    };

    // I can't get quite the code generation I want using vld4_lane_u8(),
    // so we're going to drop into assembly to do the loads. :/

    uint8x8_t R,G,B,A;
    asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
        "vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
        "vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
        "vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
        : "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
    r = to_fixed15(R);
    g = to_fixed15(G);
    b = to_fixed15(B);
    a = to_fixed15(A);

#else
    auto to_fixed15 = [k](__m128i u8) {
        F u16 = _mm256_cvtepu8_epi16(u8);
        return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
    };
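    // Spot check, as on aarch64: u16 == 255 gives (255<<7) + (255>>1) + ((255+1)>>8)
    // == 32640 + 127 + 1 == 32768 (1.0), and u16 == 0 gives 0, so the endpoints are
    // exact here too.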

    // TODO: shorter, more confusing, faster with 256-bit loads and shuffles

    // Load 16 interleaved pixels.
    auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0),
         _4567 = _mm_loadu_si128((const __m128i*)ptr + 1),
         _89AB = _mm_loadu_si128((const __m128i*)ptr + 2),
         _CDEF = _mm_loadu_si128((const __m128i*)ptr + 3);

    // We've got an awful lot of unpacking to do to transpose this...
    auto _0415 = _mm_unpacklo_epi8(_0123, _4567),   // r04 g04 b04 a04 r15 g15 b15 a15
         _2637 = _mm_unpackhi_epi8(_0123, _4567),   // r26 g26 b26 a26 r37 g37 b37 a37
         _8C9D = _mm_unpacklo_epi8(_89AB, _CDEF),
         _AEBF = _mm_unpackhi_epi8(_89AB, _CDEF);

    auto _0246 = _mm_unpacklo_epi8(_0415, _2637),   // r0246 g0246 b0246 a0246
         _1357 = _mm_unpackhi_epi8(_0415, _2637),   // r1357 g1357 b1357 a1357
         _8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF),
         _9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF);

    auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357),  // r01234567 g01234567
         ba_01234567 = _mm_unpackhi_epi8(_0246, _1357),  // b01234567 a01234567
         rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF),  // r89ABCDEF g89ABCDEF
         ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF);  // b89ABCDEF a89ABCDEF

    r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
    g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
    b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
    a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
#endif
}

STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

#if defined(__aarch64__)
    auto from_fixed15 = [](F v) {
        // The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
        // But what's really most important is that all bytes round trip.

        // We can do this in NEON in one instruction, a saturating narrowing right shift:
        return vqshrn_n_u16(v, 7);
    };
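    // Round-trip spot check: 1.0 == 32768 narrows to min(32768>>7, 255) == 255, and the
    // canonical math agrees: (32768 - (32768>>8)) >> 7 == (32768 - 128) >> 7 == 255.
    // 0 maps to 0, and vqshrn's saturation keeps everything else in [0,255].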

    uint8x8x4_t rgba = {{
        from_fixed15(r),
        from_fixed15(g),
        from_fixed15(b),
        from_fixed15(a),
    }};
    vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
    auto from_fixed15 = [](F v) {
        // Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
        F whatever;
        return vqshrn_n_u16(vcombine_u16(v, whatever), 7);
    };

    // As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8().
    asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
        "vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
        "vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
        "vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
        : "+r"(ptr)
        : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
        : "memory");

#else
    auto from_fixed15 = [](F v) {
        // See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
        // Here we take a different approach: (v saturated+ v) >> 8.
        v = (v+v) >> 8;
        return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
                                _mm256_extracti128_si256(v, 1));
    };
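    // Spot check: F::operator+ saturates, so 1.0 == 32768 gives (32768 + 32768) -> 65535,
    // and 65535 >> 8 == 255; 0 stays 0. Below saturation, (v+v)>>8 == v>>7, which is close
    // enough to the canonical math that all 256 byte values still round trip.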

    auto R = from_fixed15(r),
         G = from_fixed15(g),
         B = from_fixed15(b),
         A = from_fixed15(a);

    auto rg_01234567 = _mm_unpacklo_epi8(R,G),  // rg0 rg1 rg2 ... rg7
         rg_89ABCDEF = _mm_unpackhi_epi8(R,G),  // rg8 rg9 rgA ... rgF
         ba_01234567 = _mm_unpacklo_epi8(B,A),
         ba_89ABCDEF = _mm_unpackhi_epi8(B,A);
    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF));
#endif
}