blob: 7323607bd95a485cc9a4ae21794638885149cf32 [file] [log] [blame]
Chris Daltonc3cb0992020-11-02 10:52:26 -07001/*
2 * Copyright 2020 Google LLC.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef GrVx_DEFINED
9#define GrVx_DEFINED
10
Chris Daltond461f6e2020-11-23 13:51:06 -070011#include "include/core/SkTypes.h"
Chris Daltonc3cb0992020-11-02 10:52:26 -070012#include "include/private/SkVx.h"
13
14// grvx is Ganesh's addendum to skvx, Skia's SIMD library. Here we introduce functions that are
15// approximate and/or have LSB differences from platform to platform (e.g., by using hardware FMAs
16// when available). When a function is approximate, its error range is well documented and tested.
17namespace grvx {
18
19// Allow floating point contraction. e.g., allow a*x + y to be compiled to a single FMA even though
20// it introduces LSB differences on platforms that don't have an FMA instruction.
21#if defined(__clang__)
22 #pragma STDC FP_CONTRACT ON
23#endif
24
25// Use familiar type names and functions from SkSL and GLSL.
26template<int N> using vec = skvx::Vec<N, float>;
27using float2 = vec<2>;
28using float4 = vec<4>;
29
30template<int N> using ivec = skvx::Vec<N, int32_t>;
31using int2 = ivec<2>;
32using int4 = ivec<4>;
33
34template<int N> using uvec = skvx::Vec<N, uint32_t>;
35using uint2 = uvec<2>;
36using uint4 = uvec<4>;
37
Chris Daltond461f6e2020-11-23 13:51:06 -070038static SK_ALWAYS_INLINE float dot(float2 a, float2 b) {
Chris Daltonc3cb0992020-11-02 10:52:26 -070039 float2 ab = a*b;
40 return ab[0] + ab[1];
41}
42
Chris Daltond461f6e2020-11-23 13:51:06 -070043static SK_ALWAYS_INLINE float cross(float2 a, float2 b) {
Chris Daltonc3cb0992020-11-02 10:52:26 -070044 float2 x = a*skvx::shuffle<1,0>(b);
45 return x[0] - x[1];
46}
47
48// Returns f*m + a. The actual implementation may or may not be fused, depending on hardware
49// support. We call this method "fast_madd" to draw attention to the fact that the operation may
50// give different results on different platforms.
Chris Daltond461f6e2020-11-23 13:51:06 -070051template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) {
Chris Daltonc3cb0992020-11-02 10:52:26 -070052#if FP_FAST_FMAF
53 return skvx::fma(f,m,a);
54#else
55 return f*m + a;
56#endif
57}
58
59// Approximates the inverse cosine of x within 0.96 degrees using the rational polynomial:
60//
61// acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
62//
63// See: https://stackoverflow.com/a/36387954
64//
65// For a proof of max error, see the "grvx_approx_acos" unit test.
66//
67// NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
68// infinite at -1 and 1). So the input must still be clamped between -1 and 1.
Chris Daltonbb33be22021-02-24 16:30:34 -070069#define GRVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
Chris Daltond461f6e2020-11-23 13:51:06 -070070template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) {
Chris Dalton356cef32020-12-01 02:11:24 -070071 constexpr static float a = -0.939115566365855f;
72 constexpr static float b = 0.9217841528914573f;
73 constexpr static float c = -1.2845906244690837f;
74 constexpr static float d = 0.295624144969963174f;
75 constexpr static float pi_over_2 = 1.5707963267948966f;
Chris Daltonc3cb0992020-11-02 10:52:26 -070076 vec<N> xx = x*x;
Chris Dalton356cef32020-12-01 02:11:24 -070077 vec<N> numer = fast_madd<N>(b,xx,a);
78 vec<N> denom = fast_madd<N>(xx, fast_madd<N>(d,xx,c), 1);
79 return fast_madd<N>(x, numer/denom, pi_over_2);
Chris Daltonc3cb0992020-11-02 10:52:26 -070080}
81
Chris Dalton356cef32020-12-01 02:11:24 -070082// Approximates the angle between vectors a and b within .96 degrees (GRVX_FAST_ACOS_MAX_ERROR).
83// a (and b) represent "N" (Nx2/2) 2d vectors in SIMD, with the x values found in a.lo, and the
84// y values in a.hi.
Chris Dalton6bacd9f2020-11-02 16:12:12 -070085//
Chris Dalton356cef32020-12-01 02:11:24 -070086// Due to fp32 overflow, this method is only valid for magnitudes in the range (2^-31, 2^31)
87// exclusive. Results are undefined if the inputs fall outside this range.
Chris Dalton6bacd9f2020-11-02 16:12:12 -070088//
89// NOTE: If necessary, we can extend our valid range to 2^(+/-63) by normalizing a and b separately.
90// i.e.: "cosTheta = dot(a,b) / sqrt(dot(a,a)) / sqrt(dot(b,b))".
Chris Dalton356cef32020-12-01 02:11:24 -070091template<int Nx2>
92SK_ALWAYS_INLINE vec<Nx2/2> approx_angle_between_vectors(vec<Nx2> a, vec<Nx2> b) {
93 auto aa=a*a, bb=b*b, ab=a*b;
94 auto cosTheta = (ab.lo + ab.hi) / skvx::sqrt((aa.lo + aa.hi) * (bb.lo + bb.hi));
Chris Daltonc3cb0992020-11-02 10:52:26 -070095 // Clamp cosTheta such that if it is NaN (e.g., if a or b was 0), then we return acos(1) = 0.
96 cosTheta = skvx::max(skvx::min(1, cosTheta), -1);
97 return approx_acos(cosTheta);
98}
99
Chris Daltond461f6e2020-11-23 13:51:06 -0700100// De-interleaving load of 4 vectors.
101//
102// WARNING: These are really only supported well on NEON. Consider restructuring your data before
103// resorting to these methods.
104template<typename T>
105SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b,
106 skvx::Vec<1,T>& c, skvx::Vec<1,T>& d) {
107 a.val = v[0];
108 b.val = v[1];
109 c.val = v[2];
110 d.val = v[3];
111}
112template<int N, typename T>
113SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
114strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, skvx::Vec<N,T>& c,
115 skvx::Vec<N,T>& d) {
116 strided_load4(v, a.lo, b.lo, c.lo, d.lo);
117 strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
118}
119#if !defined(SKNX_NO_SIMD)
120#if defined(__ARM_NEON)
121#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
122template<> \
123SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, \
124 skvx::Vec<N,T>& c, skvx::Vec<N,T>& d) { \
125 auto mat = VLD(v); \
126 a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
127 b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
128 c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \
129 d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \
130}
131IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32);
132IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16);
133IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8);
134IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32);
135IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16);
136IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8);
137IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32);
138IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32);
139IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16);
140IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8);
141IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32);
142IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16);
143IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8);
144IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32);
145#undef IMPL_LOAD4_TRANSPOSED
146#elif defined(__SSE__)
147template<>
148SK_ALWAYS_INLINE void strided_load4(const float* v, float4& a, float4& b, float4& c, float4& d) {
149 using skvx::bit_pun;
150 __m128 a_ = _mm_loadu_ps(v);
151 __m128 b_ = _mm_loadu_ps(v+4);
152 __m128 c_ = _mm_loadu_ps(v+8);
153 __m128 d_ = _mm_loadu_ps(v+12);
154 _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
155 a = bit_pun<float4>(a_);
156 b = bit_pun<float4>(b_);
157 c = bit_pun<float4>(c_);
158 d = bit_pun<float4>(d_);
159}
160#endif
161#endif
162
163// De-interleaving load of 2 vectors.
164//
165// WARNING: These are really only supported well on NEON. Consider restructuring your data before
166// resorting to these methods.
167template<typename T>
168SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) {
169 a.val = v[0];
170 b.val = v[1];
171}
172template<int N, typename T>
173SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
174strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) {
175 strided_load2(v, a.lo, b.lo);
176 strided_load2(v + 2*(N/2), a.hi, b.hi);
177}
178#if !defined(SKNX_NO_SIMD)
179#if defined(__ARM_NEON)
180#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
181template<> \
182SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \
183 auto mat = VLD(v); \
184 a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
185 b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
186}
187IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32);
188IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16);
189IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8);
190IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32);
191IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16);
192IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8);
193IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32);
194IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32);
195IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16);
196IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8);
197IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32);
198IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16);
199IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8);
200IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32);
201#undef IMPL_LOAD2_TRANSPOSED
202#endif
203#endif
204
Chris Daltonc3cb0992020-11-02 10:52:26 -0700205#if defined(__clang__)
206 #pragma STDC FP_CONTRACT DEFAULT
207#endif
208
209}; // namespace grvx
210
211#endif