Blame - src/opts/SkBlitRow_opts.h - platform/external/skia

blob: b35849263b09a1225c39cc7d762fe46f2225ba84 [file] [log] [blame]

mtklein	4a37d08	2015-09-10 10:38:02 -0700	[diff] [blame]	1	/*
				2	* Copyright 2015 Google Inc.
				3	*
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the LICENSE file.
				6	*/
				7
				8	#ifndef SkBlitRow_opts_DEFINED
				9	#define SkBlitRow_opts_DEFINED
				10
				11	#include "Sk4px.h"
Cary Clark	a4083c9	2017-09-15 11:59:23 -0400	[diff] [blame]	12	#include "SkColorData.h"
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	13	#include "SkMSAN.h"
				14
				15	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
				16	#include "SkColor_opts_SSE2.h"
				17	#endif
mtklein	4a37d08	2015-09-10 10:38:02 -0700	[diff] [blame]	18
				19	namespace SK_OPTS_NS {
				20
				21	// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp.
				22	// It's not quite perfect, but it's never wrong in the interesting edge cases,
				23	// and it's quite a bit faster than blend_perfect.
				24	//
				25	// blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one.
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	26	static inline
				27	void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
mtklein	4a37d08	2015-09-10 10:38:02 -0700	[diff] [blame]	28	unsigned invA = 255 - SkGetPackedA32(color);
				29	invA += invA >> 7;
				30	SkASSERT(invA < 256); // We've should have already handled alpha == 0 externally.
				31
				32	Sk16h colorHighAndRound = Sk4px::DupPMColor(color).widenHi() + Sk16h(128);
				33	Sk16b invA_16x(invA);
				34
				35	Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
				36	return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
				37	});
				38	}
				39
#if defined(SK_ARM_HAS_NEON)

// Lane-wise SkMulDiv255Round: returns r with r[i] = SkMulDiv255Round(x[i], y[i]), where r[i],
// x[i] and y[i] are the i-th lanes of the corresponding NEON vectors.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    // Full 16-bit products of the 8-bit lanes.
    const uint16x8_t product = vmull_u8(x, y);
    // product / 256 with rounding; adding this back approximates the division by 255.
    const uint16x8_t productOver256 = vrshrq_n_u16(product, 8);
    // Rounding add, keeping only the high byte of each 16-bit lane.
    return vraddhn_u16(product, productOver256);
}

// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.

// Planar variant: dst.val[i] and src.val[i] each hold one color component for 8 consecutive
// pixels (val[3] is used as the alpha plane). The result uses the same layout.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // Bitwise NOT of an 8-bit alpha is 255 - alpha.
    const uint8x8_t inverseAlpha = vmvn_u8(src.val[3]);

    uint8x8x4_t blended;
    for (int channel = 0; channel < 4; ++channel) {
        const uint8x8_t scaledDst = SkMulDiv255Round_neon8(inverseAlpha, dst.val[channel]);
        blended.val[channel] = vadd_u8(src.val[channel], scaledDst);
    }
    return blended;
}

// Interleaved variant: dst and src each hold the color components of two consecutive pixels.
// The return value follows the same convention.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table indices that copy byte 3 into lanes 0-3 and byte 7 into lanes 4-7, i.e. replicate
    // each pixel's alpha byte across that pixel's four lanes.
    const uint8x8_t alphaLanes = vcreate_u8(0x0707070703030303);
    const uint8x8_t srcAlphas = vtbl1_u8(src, alphaLanes);
    const uint8x8_t inverseAlpha = vmvn_u8(srcAlphas);
    return vadd_u8(src, SkMulDiv255Round_neon8(inverseAlpha, dst));
}

#endif
				78
Mike Klein	cd71f11	2017-08-23 11:11:55 -0400	[diff] [blame]	79	/not static/ inline
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	80	void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
				81	SkASSERT(alpha == 0xFF);
				82	sk_msan_assert_initialized(src, src+len);
				83
				84	#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
				85	while (len >= 16) {
				86	// Load 16 source pixels.
				87	auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
				88	s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
				89	s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
				90	s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
				91
				92	const auto alphaMask = _mm_set1_epi32(0xFF000000);
				93
				94	auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
				95	if (_mm_testz_si128(ORed, alphaMask)) {
				96	// All 16 source pixels are transparent. Nothing to do.
				97	src += 16;
				98	dst += 16;
				99	len -= 16;
				100	continue;
				101	}
				102
				103	auto d0 = (__m128i*)(dst) + 0,
				104	d1 = (__m128i*)(dst) + 1,
				105	d2 = (__m128i*)(dst) + 2,
				106	d3 = (__m128i*)(dst) + 3;
				107
				108	auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
				109	if (_mm_testc_si128(ANDed, alphaMask)) {
				110	// All 16 source pixels are opaque. SrcOver becomes Src.
				111	_mm_storeu_si128(d0, s0);
				112	_mm_storeu_si128(d1, s1);
				113	_mm_storeu_si128(d2, s2);
				114	_mm_storeu_si128(d3, s3);
				115	src += 16;
				116	dst += 16;
				117	len -= 16;
				118	continue;
				119	}
				120
				121	// TODO: This math is wrong.
				122	// Do SrcOver.
				123	_mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
				124	_mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
				125	_mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
				126	_mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
				127	src += 16;
				128	dst += 16;
				129	len -= 16;
				130	}
				131
				132	#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
				133	while (len >= 16) {
				134	// Load 16 source pixels.
				135	auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
				136	s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
				137	s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
				138	s3 = _mm_loadu_si128((const __m128i*)(src) + 3);
				139
				140	const auto alphaMask = _mm_set1_epi32(0xFF000000);
				141
				142	auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
				143	if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
				144	_mm_setzero_si128()))) {
				145	// All 16 source pixels are transparent. Nothing to do.
				146	src += 16;
				147	dst += 16;
				148	len -= 16;
				149	continue;
				150	}
				151
				152	auto d0 = (__m128i*)(dst) + 0,
				153	d1 = (__m128i*)(dst) + 1,
				154	d2 = (__m128i*)(dst) + 2,
				155	d3 = (__m128i*)(dst) + 3;
				156
				157	auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
				158	if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
				159	alphaMask))) {
				160	// All 16 source pixels are opaque. SrcOver becomes Src.
				161	_mm_storeu_si128(d0, s0);
				162	_mm_storeu_si128(d1, s1);
				163	_mm_storeu_si128(d2, s2);
				164	_mm_storeu_si128(d3, s3);
				165	src += 16;
				166	dst += 16;
				167	len -= 16;
				168	continue;
				169	}
				170
				171	// TODO: This math is wrong.
				172	// Do SrcOver.
				173	_mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
				174	_mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
				175	_mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
				176	_mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
				177
				178	src += 16;
				179	dst += 16;
				180	len -= 16;
				181	}
				182
				183	#elif defined(SK_ARM_HAS_NEON)
Matteo Franchin	a132c38	2017-05-26 18:56:51 +0100	[diff] [blame]	184	// Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
				185	// underperformed on some of the platforms under test for inputs with frequent transitions of
				186	// alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
				187	// revisiting the situation in the future.
				188	while (len >= 8) {
				189	// Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
				190	// for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
				191	// pixels).
				192	uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
				193	src += 8;
				194	len -= 8;
				195
				196	// We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
				197	// are all transparent), the second when all alphas are fully set (they are all opaque).
				198	uint8x8_t alphas = src_col.val[3];
				199	uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
				200	if (alphas_u64 == 0) {
				201	// All pixels transparent.
				202	dst += 8;
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	203	continue;
				204	}
				205
Matteo Franchin	a132c38	2017-05-26 18:56:51 +0100	[diff] [blame]	206	if (~alphas_u64 == 0) {
				207	// All pixels opaque.
				208	vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
				209	dst += 8;
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	210	continue;
				211	}
				212
Matteo Franchin	a132c38	2017-05-26 18:56:51 +0100	[diff] [blame]	213	uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
				214	vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
				215	dst += 8;
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	216	}
Matteo Franchin	a132c38	2017-05-26 18:56:51 +0100	[diff] [blame]	217
				218	// Deal with leftover pixels.
				219	for (; len >= 2; len -= 2, src += 2, dst += 2) {
				220	uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
				221	uint8x8_t dst2 = vld1_u8(reinterpret_cast<const uint8_t*>(dst));
				222	vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
				223	}
				224
				225	if (len != 0) {
				226	uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(dst), vcreate_u8(src));
				227	vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
				228	}
				229	return;
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	230	#endif
				231
				232	while (len-- > 0) {
mtklein	3e31812	2016-06-17 13:47:53 -0700	[diff] [blame]	233	// This 0xFF000000 is not semantically necessary, but for compatibility
				234	// with chromium:611002 we need to keep it until we figure out where
				235	// the non-premultiplied src values (like 0x00FFFFFF) are coming from.
				236	// TODO(mtklein): sort this out and assert *src is premul here.
				237	if (*src & 0xFF000000) {
mtklein	b4a7dc9	2016-03-23 06:29:12 -0700	[diff] [blame]	238	dst = (src >= 0xFF000000) ? src : SkPMSrcOver(src, *dst);
				239	}
				240	src++;
				241	dst++;
				242	}
				243	}
				244
mtklein	4a37d08	2015-09-10 10:38:02 -0700	[diff] [blame]	245	} // SK_OPTS_NS
				246
				247	#endif//SkBlitRow_opts_DEFINED