/*
 * Copyright 2020 Google LLC.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef GrVx_DEFINED
#define GrVx_DEFINED

#include "include/core/SkTypes.h"
#include "include/private/SkVx.h"

// grvx is Ganesh's addendum to skvx, Skia's SIMD library. Here we introduce functions that are
// approximate and/or have LSB differences from platform to platform (e.g., by using hardware FMAs
// when available). When a function is approximate, its error range is well documented and tested.
namespace grvx {

// Allow floating-point contraction, e.g., allow a*x + y to be compiled to a single FMA even though
// it introduces LSB differences on platforms that don't have an FMA instruction.
#if defined(__clang__)
#pragma STDC FP_CONTRACT ON
#endif

// Use familiar type names and functions from SkSL and GLSL.
template<int N> using vec = skvx::Vec<N, float>;
using float2 = vec<2>;
using float4 = vec<4>;

template<int N> using ivec = skvx::Vec<N, int32_t>;
using int2 = ivec<2>;
using int4 = ivec<4>;

template<int N> using uvec = skvx::Vec<N, uint32_t>;
using uint2 = uvec<2>;
using uint4 = uvec<4>;
static SK_ALWAYS_INLINE float dot(float2 a, float2 b) {
    float2 ab = a*b;
    return ab[0] + ab[1];
}

static SK_ALWAYS_INLINE float cross(float2 a, float2 b) {
    float2 x = a*skvx::shuffle<1,0>(b);
    return x[0] - x[1];
}

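// For example (an illustrative usage sketch; the vector values are hypothetical):
//
//     float2 v0 = {1, 0};
//     float2 v1 = {0, 1};
//     float d = dot(v0, v1);    // 0: the vectors are perpendicular.
//     float c = cross(v0, v1);  // +1: the 2d scalar cross product, positive because v1 is
//                               // 90 degrees counterclockwise from v0.
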
// Returns f*m + a. The actual implementation may or may not be fused, depending on hardware
// support. We call this method "fast_madd" to draw attention to the fact that the operation may
// give different results on different platforms.
template<int N> SK_ALWAYS_INLINE vec<N> fast_madd(vec<N> f, vec<N> m, vec<N> a) {
#if FP_FAST_FMAF
    return skvx::fma(f,m,a);
#else
    return f*m + a;
#endif
}

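// For example, evaluating 3x^2 + 2x + 1 via Horner's rule (an illustrative sketch; the
// coefficients are hypothetical):
//
//     float4 x = {0, .25f, .5f, 1};
//     // Each step may or may not compile to a fused FMA, so the low bits of "y" can differ
//     // from platform to platform.
//     float4 y = fast_madd<4>(fast_madd<4>(3, x, 2), x, 1);
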
// Approximates the inverse cosine of x within 0.96 degrees using the rational polynomial:
//
//     acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
//
// See: https://stackoverflow.com/a/36387954
//
// For a proof of max error, see the "grvx_approx_acos" unit test.
//
// NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
// infinite at -1 and 1.) So the input must still be clamped between -1 and 1.
#define GRVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) {
    constexpr static float a = -0.939115566365855f;
    constexpr static float b = 0.9217841528914573f;
    constexpr static float c = -1.2845906244690837f;
    constexpr static float d = 0.295624144969963174f;
    constexpr static float pi_over_2 = 1.5707963267948966f;
    vec<N> xx = x*x;
    vec<N> numer = fast_madd<N>(b,xx,a);
    vec<N> denom = fast_madd<N>(xx, fast_madd<N>(d,xx,c), 1);
    return fast_madd<N>(x, numer/denom, pi_over_2);
}

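// For example (an illustrative sketch; "cosTheta" stands for a hypothetical value that may have
// drifted slightly outside [-1, 1] due to floating point error):
//
//     vec<N> cosTheta = ...;
//     // Clamp first: approx_acos is only valid on [-1, 1].
//     vec<N> theta = approx_acos(skvx::max(skvx::min(1, cosTheta), -1));
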
// Approximates the angle between vectors a and b, within .96 degrees
// (GRVX_APPROX_ACOS_MAX_ERROR). a and b each represent N (= Nx2/2) 2d vectors in SIMD, with
// the x values found in a.lo, and the y values in a.hi.
//
// Due to fp32 overflow, this method is only valid for vector magnitudes in the open range
// (2^-31, 2^31). Results are undefined if the inputs fall outside this range.
//
// NOTE: If necessary, we can extend our valid range to 2^(+/-63) by normalizing a and b
// separately. i.e.: "cosTheta = dot(a,b) / sqrt(dot(a,a)) / sqrt(dot(b,b))".
template<int Nx2>
SK_ALWAYS_INLINE vec<Nx2/2> approx_angle_between_vectors(vec<Nx2> a, vec<Nx2> b) {
    auto aa=a*a, bb=b*b, ab=a*b;
    auto cosTheta = (ab.lo + ab.hi) / skvx::sqrt((aa.lo + aa.hi) * (bb.lo + bb.hi));
    // Clamp cosTheta such that if it is NaN (e.g., if a or b was 0), then we return acos(1) = 0.
    cosTheta = skvx::max(skvx::min(1, cosTheta), -1);
    return approx_acos(cosTheta);
}

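// For example, measuring two angles at once (an illustrative sketch; the ax0/ay0/... values
// are hypothetical):
//
//     // Angle 0 is between (ax0,ay0) and (bx0,by0); angle 1 is between (ax1,ay1) and
//     // (bx1,by1). With Nx2=4, the x values go in .lo and the y values in .hi.
//     float4 a = {ax0, ax1, ay0, ay1};
//     float4 b = {bx0, bx1, by0, by1};
//     float2 theta = approx_angle_between_vectors(a, b);
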
// De-interleaving load of 4 vectors.
//
// WARNING: These are really only supported well on NEON. Consider restructuring your data before
// resorting to these methods.
template<typename T>
SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b,
                                    skvx::Vec<1,T>& c, skvx::Vec<1,T>& d) {
    a.val = v[0];
    b.val = v[1];
    c.val = v[2];
    d.val = v[3];
}
template<int N, typename T>
SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, skvx::Vec<N,T>& c,
              skvx::Vec<N,T>& d) {
    strided_load4(v, a.lo, b.lo, c.lo, d.lo);
    strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
}
#if !defined(SKNX_NO_SIMD)
#if defined(__ARM_NEON)
#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
template<> \
SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, \
                                    skvx::Vec<N,T>& c, skvx::Vec<N,T>& d) { \
    auto mat = VLD(v); \
    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
    c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \
    d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \
}
IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32);
IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16);
IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8);
IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32);
IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16);
IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8);
IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32);
IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32);
IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16);
IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8);
IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32);
IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16);
IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8);
IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32);
#undef IMPL_LOAD4_TRANSPOSED
#elif defined(__SSE__)
template<>
SK_ALWAYS_INLINE void strided_load4(const float* v, float4& a, float4& b, float4& c, float4& d) {
    using skvx::bit_pun;
    __m128 a_ = _mm_loadu_ps(v);
    __m128 b_ = _mm_loadu_ps(v+4);
    __m128 c_ = _mm_loadu_ps(v+8);
    __m128 d_ = _mm_loadu_ps(v+12);
    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
    a = bit_pun<float4>(a_);
    b = bit_pun<float4>(b_);
    c = bit_pun<float4>(c_);
    d = bit_pun<float4>(d_);
}
#endif
#endif

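// For example, de-interleaving the components of 8 RGBA pixels (an illustrative sketch; the
// "pixels" array is hypothetical):
//
//     const uint8_t pixels[32] = {...};  // r0,g0,b0,a0, r1,g1,b1,a1, ...
//     skvx::Vec<8,uint8_t> r, g, b, a;
//     strided_load4(pixels, r, g, b, a);  // r = {r0..r7}, g = {g0..g7}, etc.
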
// De-interleaving load of 2 vectors.
//
// WARNING: These are really only supported well on NEON. Consider restructuring your data before
// resorting to these methods.
template<typename T>
SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) {
    a.val = v[0];
    b.val = v[1];
}
template<int N, typename T>
SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) {
    strided_load2(v, a.lo, b.lo);
    strided_load2(v + 2*(N/2), a.hi, b.hi);
}
#if !defined(SKNX_NO_SIMD)
#if defined(__ARM_NEON)
#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
template<> \
SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \
    auto mat = VLD(v); \
    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
}
IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32);
IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16);
IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8);
IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32);
IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16);
IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8);
IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32);
IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32);
IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16);
IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8);
IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32);
IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16);
IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8);
IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32);
#undef IMPL_LOAD2_TRANSPOSED
#endif
#endif

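// For example, separating the coordinates of 4 interleaved 2d points (an illustrative sketch;
// the "pts" array is hypothetical):
//
//     const float pts[8] = {x0,y0, x1,y1, x2,y2, x3,y3};
//     float4 x, y;
//     strided_load2(pts, x, y);  // x = {x0,x1,x2,x3}, y = {y0,y1,y2,y3}
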
#if defined(__clang__)
#pragma STDC FP_CONTRACT DEFAULT
#endif

}  // namespace grvx

#endif