/*
 * Copyright 2020 Google LLC.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef GrVx_DEFINED
#define GrVx_DEFINED

#include "include/core/SkTypes.h"
#include "include/private/SkVx.h"

// grvx is Ganesh's addendum to skvx, Skia's SIMD library. Here we introduce functions that are
// approximate and/or have LSB differences from platform to platform (e.g., by using hardware FMAs
// when available). When a function is approximate, its error range is well documented and tested.
namespace grvx {

// Allow floating-point contraction, e.g., allow a*x + y to be compiled to a single FMA even though
// it introduces LSB differences on platforms that don't have an FMA instruction.
#if defined(__clang__)
#pragma STDC FP_CONTRACT ON
#endif

// Use familiar type names and functions from SkSL and GLSL.
template<int N> using vec = skvx::Vec<N, float>;
using float2 = vec<2>;
using float4 = vec<4>;

template<int N> using ivec = skvx::Vec<N, int32_t>;
using int2 = ivec<2>;
using int4 = ivec<4>;

template<int N> using uvec = skvx::Vec<N, uint32_t>;
using uint2 = uvec<2>;
using uint4 = uvec<4>;
static SK_ALWAYS_INLINE float dot(float2 a, float2 b) {
    float2 ab = a*b;
    return ab[0] + ab[1];
}

static SK_ALWAYS_INLINE float cross(float2 a, float2 b) {
    float2 x = a*skvx::shuffle<1,0>(b);
    return x[0] - x[1];
}

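// For example (an illustrative usage sketch; the vector values are hypothetical):
//
//     float2 v0 = {1, 0};
//     float2 v1 = {0, 1};
//     float d = dot(v0, v1);    // 0: the vectors are perpendicular.
//     float c = cross(v0, v1);  // +1: the 2d scalar cross product, positive because v1 is
//                               // 90 degrees counterclockwise from v0.
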
// Returns f*m + a. The actual implementation may or may not be fused, depending on hardware
// support. We call this method "fast_madd" to draw attention to the fact that the operation may
// give different results on different platforms.
template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) {
#if FP_FAST_FMAF
    return skvx::fma(f,m,a);
#else
    return f*m + a;
#endif
}

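// For example, evaluating 3x^2 + 2x + 1 via Horner's rule (an illustrative sketch; the
// coefficients are hypothetical):
//
//     float4 x = {0, .25f, .5f, 1};
//     // Each step may or may not compile to a fused FMA, so the low bits of "y" can differ
//     // from platform to platform.
//     float4 y = fast_madd<4>(fast_madd<4>(3, x, 2), x, 1);
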
// Approximates the inverse cosine of x within 0.96 degrees using the rational polynomial:
//
//     acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
//
// See: https://stackoverflow.com/a/36387954
//
// For a proof of max error, see the "grvx_approx_acos" unit test.
//
// NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
// infinite at -1 and 1.) So the input must still be clamped between -1 and 1.
#define GRVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) {
    constexpr static float a = -0.939115566365855f;
    constexpr static float b = 0.9217841528914573f;
    constexpr static float c = -1.2845906244690837f;
    constexpr static float d = 0.295624144969963174f;
    constexpr static float pi_over_2 = 1.5707963267948966f;
    vec<N> xx = x*x;
    vec<N> numer = fast_madd<N>(b,xx,a);
    vec<N> denom = fast_madd<N>(xx, fast_madd<N>(d,xx,c), 1);
    return fast_madd<N>(x, numer/denom, pi_over_2);
}

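// For example (an illustrative sketch; "cosTheta" stands for a hypothetical value that may have
// drifted slightly outside [-1, 1] due to floating point error):
//
//     vec<N> cosTheta = ...;
//     // Clamp first: approx_acos is only valid on [-1, 1].
//     vec<N> theta = approx_acos(skvx::max(skvx::min(1, cosTheta), -1));
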
// Approximates the angle between vectors a and b, within .96 degrees
// (GRVX_APPROX_ACOS_MAX_ERROR). a and b each represent N (= Nx2/2) 2d vectors in SIMD, with
// the x values found in a.lo, and the y values in a.hi.
//
// Due to fp32 overflow, this method is only valid for vector magnitudes in the open range
// (2^-31, 2^31). Results are undefined if the inputs fall outside this range.
//
// NOTE: If necessary, we can extend our valid range to 2^(+/-63) by normalizing a and b
// separately. i.e.: "cosTheta = dot(a,b) / sqrt(dot(a,a)) / sqrt(dot(b,b))".
template<int Nx2>
SK_ALWAYS_INLINE vec<Nx2/2> approx_angle_between_vectors(vec<Nx2> a, vec<Nx2> b) {
    auto aa=a*a, bb=b*b, ab=a*b;
    auto cosTheta = (ab.lo + ab.hi) / skvx::sqrt((aa.lo + aa.hi) * (bb.lo + bb.hi));
    // Clamp cosTheta such that if it is NaN (e.g., if a or b was 0), then we return acos(1) = 0.
    cosTheta = skvx::max(skvx::min(1, cosTheta), -1);
    return approx_acos(cosTheta);
}

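// For example, measuring two angles at once (an illustrative sketch; the ax0/ay0/... values
// are hypothetical):
//
//     // Angle 0 is between (ax0,ay0) and (bx0,by0); angle 1 is between (ax1,ay1) and
//     // (bx1,by1). With Nx2=4, the x values go in .lo and the y values in .hi.
//     float4 a = {ax0, ax1, ay0, ay1};
//     float4 b = {bx0, bx1, by0, by1};
//     float2 theta = approx_angle_between_vectors(a, b);
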
// De-interleaving load of 4 vectors.
//
// WARNING: These are really only supported well on NEON. Consider restructuring your data before
// resorting to these methods.
template<typename T>
SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b,
                                    skvx::Vec<1,T>& c, skvx::Vec<1,T>& d) {
    a.val = v[0];
    b.val = v[1];
    c.val = v[2];
    d.val = v[3];
}
template<int N, typename T>
SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, skvx::Vec<N,T>& c,
              skvx::Vec<N,T>& d) {
    strided_load4(v, a.lo, b.lo, c.lo, d.lo);
    strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
}
#if !defined(SKNX_NO_SIMD)
#if defined(__ARM_NEON)
#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
template<> \
SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, \
                                    skvx::Vec<N,T>& c, skvx::Vec<N,T>& d) { \
    auto mat = VLD(v); \
    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
    c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \
    d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \
}
IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32);
IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16);
IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8);
IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32);
IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16);
IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8);
IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32);
IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32);
IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16);
IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8);
IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32);
IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16);
IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8);
IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32);
#undef IMPL_LOAD4_TRANSPOSED
#elif defined(__SSE__)
template<>
SK_ALWAYS_INLINE void strided_load4(const float* v, float4& a, float4& b, float4& c, float4& d) {
    using skvx::bit_pun;
    __m128 a_ = _mm_loadu_ps(v);
    __m128 b_ = _mm_loadu_ps(v+4);
    __m128 c_ = _mm_loadu_ps(v+8);
    __m128 d_ = _mm_loadu_ps(v+12);
    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
    a = bit_pun<float4>(a_);
    b = bit_pun<float4>(b_);
    c = bit_pun<float4>(c_);
    d = bit_pun<float4>(d_);
}
#endif
#endif

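// For example, de-interleaving the components of 8 RGBA pixels (an illustrative sketch; the
// "pixels" array is hypothetical):
//
//     const uint8_t pixels[32] = {...};  // r0,g0,b0,a0, r1,g1,b1,a1, ...
//     skvx::Vec<8,uint8_t> r, g, b, a;
//     strided_load4(pixels, r, g, b, a);  // r = {r0..r7}, g = {g0..g7}, etc.
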
// De-interleaving load of 2 vectors.
//
// WARNING: These are really only supported well on NEON. Consider restructuring your data before
// resorting to these methods.
template<typename T>
SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) {
    a.val = v[0];
    b.val = v[1];
}
template<int N, typename T>
SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) {
    strided_load2(v, a.lo, b.lo);
    strided_load2(v + 2*(N/2), a.hi, b.hi);
}
#if !defined(SKNX_NO_SIMD)
#if defined(__ARM_NEON)
#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
template<> \
SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \
    auto mat = VLD(v); \
    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
}
IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32);
IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16);
IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8);
IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32);
IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16);
IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8);
IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32);
IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32);
IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16);
IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8);
IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32);
IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16);
IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8);
IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32);
#undef IMPL_LOAD2_TRANSPOSED
#endif
#endif

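// For example, separating the coordinates of 4 interleaved 2d points (an illustrative sketch;
// the "pts" array is hypothetical):
//
//     const float pts[8] = {x0,y0, x1,y1, x2,y2, x3,y3};
//     float4 x, y;
//     strided_load2(pts, x, y);  // x = {x0,x1,x2,x3}, y = {y0,y1,y2,y3}
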
#if defined(__clang__)
#pragma STDC FP_CONTRACT DEFAULT
#endif

}  // namespace grvx

#endif