Move approx_acos and strided loads from GrVx to SkVx

Change-Id: Icf2d589b7a748f98cfa1be77217f5a21aed0a1b2
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/457187
Commit-Queue: Chris Dalton <csmartdalton@google.com>
Reviewed-by: Brian Salomon <bsalomon@google.com>
diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index c3dbcee..ecfcdd7 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -733,6 +733,142 @@
     }
 #endif
 
+// Approximates the inverse cosine of x within 0.96 degrees using the rational polynomial:
+//
+//     acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
+//
+// See: https://stackoverflow.com/a/36387954
+//
+// For a proof of max error, see the "SkVx_approx_acos" unit test.
+//
+// NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
+// infinite at -1 and 1). So the input must still be clamped between -1 and 1.
+#define SKVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
+SIN Vec<N,float> approx_acos(Vec<N,float> x) {
+    constexpr static float a = -0.939115566365855f;
+    constexpr static float b =  0.9217841528914573f;
+    constexpr static float c = -1.2845906244690837f;
+    constexpr static float d =  0.295624144969963174f;
+    constexpr static float pi_over_2 = 1.5707963267948966f;
+    auto xx = x*x;
+    auto numer = b*xx + a;
+    auto denom = xx*(d*xx + c) + 1;
+    return x * (numer/denom) + pi_over_2;
+}
+
+// De-interleaving load of 4 vectors.
+//
+// WARNING: These are really only supported well on NEON. Consider restructuring your data before
+// resorting to these methods.
+SIT void strided_load4(const T* v,
+                       skvx::Vec<1,T>& a,
+                       skvx::Vec<1,T>& b,
+                       skvx::Vec<1,T>& c,
+                       skvx::Vec<1,T>& d) {
+    a.val = v[0];
+    b.val = v[1];
+    c.val = v[2];
+    d.val = v[3];
+}
+SINT void strided_load4(const T* v,
+                        skvx::Vec<N,T>& a,
+                        skvx::Vec<N,T>& b,
+                        skvx::Vec<N,T>& c,
+                        skvx::Vec<N,T>& d) {
+    strided_load4(v, a.lo, b.lo, c.lo, d.lo);
+    strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
+}
+#if !defined(SKNX_NO_SIMD)
+#if defined(__ARM_NEON)
+#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
+SI void strided_load4(const T* v, \
+                      skvx::Vec<N,T>& a, \
+                      skvx::Vec<N,T>& b, \
+                      skvx::Vec<N,T>& c, \
+                      skvx::Vec<N,T>& d) { \
+    auto mat = VLD(v); \
+    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
+    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
+    c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \
+    d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \
+}
+IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32);
+IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16);
+IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8);
+IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32);
+IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16);
+IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8);
+IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32);
+IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32);
+IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16);
+IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8);
+IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32);
+IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16);
+IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8);
+IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32);
+#undef IMPL_LOAD4_TRANSPOSED
+#elif defined(__SSE__)
+SI void strided_load4(const float* v,
+                      Vec<4,float>& a,
+                      Vec<4,float>& b,
+                      Vec<4,float>& c,
+                      Vec<4,float>& d) {
+    using skvx::bit_pun;
+    __m128 a_ = _mm_loadu_ps(v);
+    __m128 b_ = _mm_loadu_ps(v+4);
+    __m128 c_ = _mm_loadu_ps(v+8);
+    __m128 d_ = _mm_loadu_ps(v+12);
+    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
+    a = bit_pun<Vec<4,float>>(a_);
+    b = bit_pun<Vec<4,float>>(b_);
+    c = bit_pun<Vec<4,float>>(c_);
+    d = bit_pun<Vec<4,float>>(d_);
+}
+#endif
+#endif
+
+// De-interleaving load of 2 vectors.
+//
+// WARNING: These are really only supported well on NEON. Consider restructuring your data before
+// resorting to these methods.
+SIT void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) {
+    a.val = v[0];
+    b.val = v[1];
+}
+SINT void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) {
+    strided_load2(v, a.lo, b.lo);
+    strided_load2(v + 2*(N/2), a.hi, b.hi);
+}
+#if !defined(SKNX_NO_SIMD)
+#if defined(__ARM_NEON)
+#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
+SI void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \
+    auto mat = VLD(v); \
+    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
+    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
+}
+IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32);
+IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16);
+IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8);
+IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32);
+IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16);
+IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8);
+IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32);
+IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32);
+IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16);
+IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8);
+IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32);
+IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16);
+IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8);
+IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32);
+#undef IMPL_LOAD2_TRANSPOSED
+#endif
+#endif
+
+#if defined(__clang__)
+    #pragma STDC FP_CONTRACT DEFAULT
+#endif
+
 }  // namespace skvx
 
 #undef SINTU
diff --git a/src/gpu/GrVx.h b/src/gpu/GrVx.h
index b860124..1cfedf3 100644
--- a/src/gpu/GrVx.h
+++ b/src/gpu/GrVx.h
@@ -45,152 +45,6 @@
     return x[0] - x[1];
 }
 
-// Approximates the inverse cosine of x within 0.96 degrees using the rational polynomial:
-//
-//     acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
-//
-// See: https://stackoverflow.com/a/36387954
-//
-// For a proof of max error, see the "grvx_approx_acos" unit test.
-//
-// NOTE: This function deviates immediately from pi and 0 outside -1 and 1. (The derivatives are
-// infinite at -1 and 1). So the input must still be clamped between -1 and 1.
-#define GRVX_APPROX_ACOS_MAX_ERROR SkDegreesToRadians(.96f)
-template<int N> SK_ALWAYS_INLINE vec<N> approx_acos(vec<N> x) {
-    constexpr static float a = -0.939115566365855f;
-    constexpr static float b =  0.9217841528914573f;
-    constexpr static float c = -1.2845906244690837f;
-    constexpr static float d =  0.295624144969963174f;
-    constexpr static float pi_over_2 = 1.5707963267948966f;
-    vec<N> xx = x*x;
-    vec<N> numer = b*xx + a;
-    vec<N> denom = xx*(d*xx + c) + 1;
-    return x * (numer/denom) + pi_over_2;
-}
-
-// Approximates the angle between vectors a and b within .96 degrees (GRVX_FAST_ACOS_MAX_ERROR).
-// a (and b) represent "N" (Nx2/2) 2d vectors in SIMD, with the x values found in a.lo, and the
-// y values in a.hi.
-//
-// Due to fp32 overflow, this method is only valid for magnitudes in the range (2^-31, 2^31)
-// exclusive. Results are undefined if the inputs fall outside this range.
-//
-// NOTE: If necessary, we can extend our valid range to 2^(+/-63) by normalizing a and b separately.
-// i.e.: "cosTheta = dot(a,b) / sqrt(dot(a,a)) / sqrt(dot(b,b))".
-template<int Nx2>
-SK_ALWAYS_INLINE vec<Nx2/2> approx_angle_between_vectors(vec<Nx2> a, vec<Nx2> b) {
-    auto aa=a*a, bb=b*b, ab=a*b;
-    auto cosTheta = (ab.lo + ab.hi) / skvx::sqrt((aa.lo + aa.hi) * (bb.lo + bb.hi));
-    // Clamp cosTheta such that if it is NaN (e.g., if a or b was 0), then we return acos(1) = 0.
-    cosTheta = skvx::max(skvx::min(1, cosTheta), -1);
-    return approx_acos(cosTheta);
-}
-
-// De-interleaving load of 4 vectors.
-//
-// WARNING: These are really only supported well on NEON. Consider restructuring your data before
-// resorting to these methods.
-template<typename T>
-SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b,
-                                    skvx::Vec<1,T>& c, skvx::Vec<1,T>& d) {
-    a.val = v[0];
-    b.val = v[1];
-    c.val = v[2];
-    d.val = v[3];
-}
-template<int N, typename T>
-SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
-strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, skvx::Vec<N,T>& c,
-              skvx::Vec<N,T>& d) {
-    strided_load4(v, a.lo, b.lo, c.lo, d.lo);
-    strided_load4(v + 4*(N/2), a.hi, b.hi, c.hi, d.hi);
-}
-#if !defined(SKNX_NO_SIMD)
-#if defined(__ARM_NEON)
-#define IMPL_LOAD4_TRANSPOSED(N, T, VLD) \
-template<> \
-SK_ALWAYS_INLINE void strided_load4(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b, \
-                                    skvx::Vec<N,T>& c, skvx::Vec<N,T>& d) { \
-    auto mat = VLD(v); \
-    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
-    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
-    c = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[2]); \
-    d = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[3]); \
-}
-IMPL_LOAD4_TRANSPOSED(2, uint32_t, vld4_u32);
-IMPL_LOAD4_TRANSPOSED(4, uint16_t, vld4_u16);
-IMPL_LOAD4_TRANSPOSED(8, uint8_t, vld4_u8);
-IMPL_LOAD4_TRANSPOSED(2, int32_t, vld4_s32);
-IMPL_LOAD4_TRANSPOSED(4, int16_t, vld4_s16);
-IMPL_LOAD4_TRANSPOSED(8, int8_t, vld4_s8);
-IMPL_LOAD4_TRANSPOSED(2, float, vld4_f32);
-IMPL_LOAD4_TRANSPOSED(4, uint32_t, vld4q_u32);
-IMPL_LOAD4_TRANSPOSED(8, uint16_t, vld4q_u16);
-IMPL_LOAD4_TRANSPOSED(16, uint8_t, vld4q_u8);
-IMPL_LOAD4_TRANSPOSED(4, int32_t, vld4q_s32);
-IMPL_LOAD4_TRANSPOSED(8, int16_t, vld4q_s16);
-IMPL_LOAD4_TRANSPOSED(16, int8_t, vld4q_s8);
-IMPL_LOAD4_TRANSPOSED(4, float, vld4q_f32);
-#undef IMPL_LOAD4_TRANSPOSED
-#elif defined(__SSE__)
-template<>
-SK_ALWAYS_INLINE void strided_load4(const float* v, float4& a, float4& b, float4& c, float4& d) {
-    using skvx::bit_pun;
-    __m128 a_ = _mm_loadu_ps(v);
-    __m128 b_ = _mm_loadu_ps(v+4);
-    __m128 c_ = _mm_loadu_ps(v+8);
-    __m128 d_ = _mm_loadu_ps(v+12);
-    _MM_TRANSPOSE4_PS(a_, b_, c_, d_);
-    a = bit_pun<float4>(a_);
-    b = bit_pun<float4>(b_);
-    c = bit_pun<float4>(c_);
-    d = bit_pun<float4>(d_);
-}
-#endif
-#endif
-
-// De-interleaving load of 2 vectors.
-//
-// WARNING: These are really only supported well on NEON. Consider restructuring your data before
-// resorting to these methods.
-template<typename T>
-SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<1,T>& a, skvx::Vec<1,T>& b) {
-    a.val = v[0];
-    b.val = v[1];
-}
-template<int N, typename T>
-SK_ALWAYS_INLINE typename std::enable_if<N >= 2, void>::type
-strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) {
-    strided_load2(v, a.lo, b.lo);
-    strided_load2(v + 2*(N/2), a.hi, b.hi);
-}
-#if !defined(SKNX_NO_SIMD)
-#if defined(__ARM_NEON)
-#define IMPL_LOAD2_TRANSPOSED(N, T, VLD) \
-template<> \
-SK_ALWAYS_INLINE void strided_load2(const T* v, skvx::Vec<N,T>& a, skvx::Vec<N,T>& b) { \
-    auto mat = VLD(v); \
-    a = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[0]); \
-    b = skvx::bit_pun<skvx::Vec<N,T>>(mat.val[1]); \
-}
-IMPL_LOAD2_TRANSPOSED(2, uint32_t, vld2_u32);
-IMPL_LOAD2_TRANSPOSED(4, uint16_t, vld2_u16);
-IMPL_LOAD2_TRANSPOSED(8, uint8_t, vld2_u8);
-IMPL_LOAD2_TRANSPOSED(2, int32_t, vld2_s32);
-IMPL_LOAD2_TRANSPOSED(4, int16_t, vld2_s16);
-IMPL_LOAD2_TRANSPOSED(8, int8_t, vld2_s8);
-IMPL_LOAD2_TRANSPOSED(2, float, vld2_f32);
-IMPL_LOAD2_TRANSPOSED(4, uint32_t, vld2q_u32);
-IMPL_LOAD2_TRANSPOSED(8, uint16_t, vld2q_u16);
-IMPL_LOAD2_TRANSPOSED(16, uint8_t, vld2q_u8);
-IMPL_LOAD2_TRANSPOSED(4, int32_t, vld2q_s32);
-IMPL_LOAD2_TRANSPOSED(8, int16_t, vld2q_s16);
-IMPL_LOAD2_TRANSPOSED(16, int8_t, vld2q_s8);
-IMPL_LOAD2_TRANSPOSED(4, float, vld2q_f32);
-#undef IMPL_LOAD2_TRANSPOSED
-#endif
-#endif
-
 #if defined(__clang__)
     #pragma STDC FP_CONTRACT DEFAULT
 #endif
diff --git a/src/gpu/ops/FillRRectOp.cpp b/src/gpu/ops/FillRRectOp.cpp
index 613f7f7..4411f74 100644
--- a/src/gpu/ops/FillRRectOp.cpp
+++ b/src/gpu/ops/FillRRectOp.cpp
@@ -532,7 +532,7 @@
 
             // Convert the radii to [-1, -1, +1, +1] space and write their attribs.
             grvx::float4 radiiX, radiiY;
-            grvx::strided_load2(&SkRRectPriv::GetRadiiArray(i->fRRect)->fX, radiiX, radiiY);
+            skvx::strided_load2(&SkRRectPriv::GetRadiiArray(i->fRRect)->fX, radiiX, radiiY);
             radiiX *= 2 / (r - l);
             radiiY *= 2 / (b - t);
 
diff --git a/src/gpu/tessellate/GrStrokeTessellator.h b/src/gpu/tessellate/GrStrokeTessellator.h
index b484df8..0a4b865 100644
--- a/src/gpu/tessellate/GrStrokeTessellator.h
+++ b/src/gpu/tessellate/GrStrokeTessellator.h
@@ -79,8 +79,8 @@
     template<int N> static grvx::vec<N> ApproxNumRadialSegmentsPerRadian(
             float parametricPrecision, grvx::vec<N> strokeWidths) {
         grvx::vec<N> cosTheta = skvx::max(1 - 2 / (parametricPrecision * strokeWidths), -1);
-        // Subtract GRVX_APPROX_ACOS_MAX_ERROR so we never account for too few segments.
-        return .5f / (grvx::approx_acos(cosTheta) - GRVX_APPROX_ACOS_MAX_ERROR);
+        // Subtract SKVX_APPROX_ACOS_MAX_ERROR so we never account for too few segments.
+        return .5f / (skvx::approx_acos(cosTheta) - SKVX_APPROX_ACOS_MAX_ERROR);
     }
     // Returns the equivalent stroke width in (pre-viewMatrix) local path space that the
     // tessellator will use when rendering this stroke. This only differs from the actual stroke
diff --git a/tests/GrVxTest.cpp b/tests/GrVxTest.cpp
index 829fcef..39803a9 100644
--- a/tests/GrVxTest.cpp
+++ b/tests/GrVxTest.cpp
@@ -39,233 +39,3 @@
                 grvx::dot({a,b}, {c,d}), SkPoint::DotProduct({a,b}, {c,d}), kTolerance));
     }
 }
-
-static bool check_approx_acos(skiatest::Reporter* r, float x, float approx_acos_x) {
-    float acosf_x = acosf(x);
-    float error = acosf_x - approx_acos_x;
-    if (!(fabsf(error) <= GRVX_APPROX_ACOS_MAX_ERROR)) {
-        ERRORF(r, "Larger-than-expected error from grvx::approx_acos\n"
-                  "  x=              %f\n"
-                  "  approx_acos_x=  %f  (%f degrees\n"
-                  "  acosf_x=        %f  (%f degrees\n"
-                  "  error=          %f  (%f degrees)\n"
-                  "  tolerance=      %f  (%f degrees)\n\n",
-                  x, approx_acos_x, SkRadiansToDegrees(approx_acos_x), acosf_x,
-                  SkRadiansToDegrees(acosf_x), error, SkRadiansToDegrees(error),
-                  GRVX_APPROX_ACOS_MAX_ERROR, SkRadiansToDegrees(GRVX_APPROX_ACOS_MAX_ERROR));
-        return false;
-    }
-    return true;
-}
-
-DEF_TEST(grvx_approx_acos, r) {
-    float4 boundaries = approx_acos(float4{-1, 0, 1, 0});
-    check_approx_acos(r, -1, boundaries[0]);
-    check_approx_acos(r, 0, boundaries[1]);
-    check_approx_acos(r, +1, boundaries[2]);
-
-    // Select a distribution of starting points around which to begin testing approx_acos. These
-    // fall roughly around the known minimum and maximum errors. No need to include -1, 0, or 1
-    // since those were just tested above. (Those are tricky because 0 is an inflection and the
-    // derivative is infinite at 1 and -1.)
-    constexpr static int N = 8;
-    vec<8> x = {-.99f, -.8f, -.4f, -.2f, .2f, .4f, .8f, .99f};
-
-    // Converge at the various local minima and maxima of "approx_acos(x) - cosf(x)" and verify that
-    // approx_acos is always within "kTolerance" degrees of the expected answer.
-    vec<N> err_;
-    for (int iter = 0; iter < 10; ++iter) {
-        // Run our approximate inverse cosine approximation.
-        vec<N> approx_acos_x = approx_acos(x);
-
-        // Find d/dx(error)
-        //    = d/dx(approx_acos(x) - acos(x))
-        //    = (f'g - fg')/gg + 1/sqrt(1 - x^2), [where f = bx^3 + ax, g = dx^4 + cx^2 + 1]
-        vec<N> xx = x*x;
-        vec<N> a = -0.939115566365855f;
-        vec<N> b =  0.9217841528914573f;
-        vec<N> c = -1.2845906244690837f;
-        vec<N> d =  0.295624144969963174f;
-        vec<N> f = (b*xx + a)*x;
-        vec<N> f_ = 3*b*xx + a;
-        vec<N> g = (d*xx + c)*xx + 1;
-        vec<N> g_ = (4*d*xx + 2*c)*x;
-        vec<N> gg = g*g;
-        vec<N> q = skvx::sqrt(1 - xx);
-        err_ = (f_*g - f*g_)/gg + 1/q;
-
-        // Find d^2/dx^2(error)
-        //    = ((f''g - fg'')g^2 - (f'g - fg')2gg') / g^4 + x(1 - x^2)^(-3/2)
-        //    = ((f''g - fg'')g - (f'g - fg')2g') / g^3 + x(1 - x^2)^(-3/2)
-        vec<N> f__ = 6*b*x;
-        vec<N> g__ = 12*d*xx + 2*c;
-        vec<N> err__ = ((f__*g - f*g__)*g - (f_*g - f*g_)*2*g_) / (gg*g) + x/((1 - xx)*q);
-
-#if 0
-        SkDebugf("\n\niter %i\n", iter);
-#endif
-        // Ensure each lane's approximation is within maximum error.
-        for (int j = 0; j < N; ++j) {
-#if 0
-            SkDebugf("x=%f  err=%f  err'=%f  err''=%f\n",
-                     x[j], SkRadiansToDegrees(approx_acos_x[j] - acosf(x[j])),
-                     SkRadiansToDegrees(err_[j]), SkRadiansToDegrees(err__[j]));
-#endif
-            if (!check_approx_acos(r, x[j], approx_acos_x[j])) {
-                return;
-            }
-        }
-
-        // Use Newton's method to update the x values to locations closer to their local minimum or
-        // maximum. (This is where d/dx(error) == 0.)
-        x -= err_/err__;
-        x = skvx::pin(x, vec<N>(-.99f), vec<N>(.99f));
-    }
-
-    // Ensure each lane converged to a local minimum or maximum.
-    for (int j = 0; j < N; ++j) {
-        REPORTER_ASSERT(r, SkScalarNearlyZero(err_[j]));
-    }
-
-    // Make sure we found all the actual known locations of local min/max error.
-    for (float knownRoot : {-0.983536f, -0.867381f, -0.410923f, 0.410923f, 0.867381f, 0.983536f}) {
-        REPORTER_ASSERT(r, skvx::any(skvx::abs(x - knownRoot) < SK_ScalarNearlyZero));
-    }
-}
-
-static float precise_angle_between_vectors(SkPoint a, SkPoint b) {
-    if (a.isZero() || b.isZero()) {
-        return 0;
-    }
-    double ax=a.fX, ay=a.fY, bx=b.fX, by=b.fY;
-    double theta = (ax*bx + ay*by) / sqrt(ax*ax + ay*ay) / sqrt(bx*bx + by*by);
-    return (float)acos(theta);
-}
-
-static bool check_approx_angle_between_vectors(skiatest::Reporter* r, SkVector a, SkVector b,
-                                               float approxTheta) {
-    float expectedTheta = precise_angle_between_vectors(a, b);
-    float error = expectedTheta - approxTheta;
-    if (!(fabsf(error) <= GRVX_APPROX_ACOS_MAX_ERROR + SK_ScalarNearlyZero)) {
-        int expAx = SkFloat2Bits(a.fX) >> 23 & 0xff;
-        int expAy = SkFloat2Bits(a.fY) >> 23 & 0xff;
-        int expBx = SkFloat2Bits(b.fX) >> 23 & 0xff;
-        int expBy = SkFloat2Bits(b.fY) >> 23 & 0xff;
-        ERRORF(r, "Larger-than-expected error from grvx::approx_angle_between_vectors\n"
-                  "  a=                {%f, %f}\n"
-                  "  b=                {%f, %f}\n"
-                  "  expA=             {%u, %u}\n"
-                  "  expB=             {%u, %u}\n"
-                  "  approxTheta=      %f  (%f degrees\n"
-                  "  expectedTheta=     %f  (%f degrees)\n"
-                  "  error=             %f  (%f degrees)\n"
-                  "  tolerance=         %f  (%f degrees)\n\n",
-                  a.fX, a.fY, b.fX, b.fY, expAx, expAy, expBx, expBy, approxTheta,
-                  SkRadiansToDegrees(approxTheta), expectedTheta, SkRadiansToDegrees(expectedTheta),
-                  error, SkRadiansToDegrees(error), GRVX_APPROX_ACOS_MAX_ERROR,
-                  SkRadiansToDegrees(GRVX_APPROX_ACOS_MAX_ERROR));
-        return false;
-    }
-    return true;
-}
-
-static bool check_approx_angle_between_vectors(skiatest::Reporter* r, SkVector a, SkVector b) {
-    float approxTheta = grvx::approx_angle_between_vectors(bit_pun<float2>(a),
-                                                           bit_pun<float2>(b)).val;
-    return check_approx_angle_between_vectors(r, a, b, approxTheta);
-}
-
-DEF_TEST(grvx_approx_angle_between_vectors, r) {
-    // Test when a and/or b are zero.
-    REPORTER_ASSERT(r, SkScalarNearlyZero(grvx::approx_angle_between_vectors<2>({0,0}, {0,0}).val));
-    REPORTER_ASSERT(r, SkScalarNearlyZero(grvx::approx_angle_between_vectors<2>({1,1}, {0,0}).val));
-    REPORTER_ASSERT(r, SkScalarNearlyZero(grvx::approx_angle_between_vectors<2>({0,0}, {1,1}).val));
-    check_approx_angle_between_vectors(r, {0,0}, {0,0});
-    check_approx_angle_between_vectors(r, {1,1}, {0,0});
-    check_approx_angle_between_vectors(r, {0,0}, {1,1});
-
-    // Test infinities.
-    REPORTER_ASSERT(r, SkScalarNearlyZero(grvx::approx_angle_between_vectors<2>(
-            {std::numeric_limits<float>::infinity(),1}, {2,3}).val));
-
-    // Test NaNs.
-    REPORTER_ASSERT(r, SkScalarNearlyZero(grvx::approx_angle_between_vectors<2>(
-            {std::numeric_limits<float>::quiet_NaN(),1}, {2,3}).val));
-
-    // Test demorms.
-    float epsilon = std::numeric_limits<float>::denorm_min();
-    REPORTER_ASSERT(r, SkScalarNearlyZero(grvx::approx_angle_between_vectors<2>(
-            {epsilon, epsilon}, {epsilon, epsilon}).val));
-
-    // Test random floats of all types.
-    uint4 mantissas = {0,0,0,0};
-    uint4 exp = uint4{126, 127, 128, 129};
-    for (uint32_t i = 0; i < (1 << 12); ++i) {
-        // approx_angle_between_vectors is only valid for absolute values < 2^31.
-        uint4 exp_ = skvx::min(exp, 127 + 30);
-        uint32_t a=exp_[0], b=exp_[1], c=exp_[2], d=exp_[3];
-        // approx_angle_between_vectors is only valid if at least one vector component's magnitude
-        // is >2^-31.
-        a = std::max(a, 127u - 30);
-        c = std::max(a, 127u - 30);
-        // Run two tests where both components of both vectors have the same exponent, one where
-        // both components of a given vector have the same exponent, and one where all components of
-        // all vectors have different exponents.
-        uint4 x0exp = uint4{a,c,a,a} << 23;
-        uint4 y0exp = uint4{a,c,a,b} << 23;
-        uint4 x1exp = uint4{a,c,c,c} << 23;
-        uint4 y1exp = uint4{a,c,c,d} << 23;
-        uint4 signs = uint4{i<<31, i<<30, i<<29, i<<28} & (1u<<31);
-        float4 x0 = bit_pun<float4>(signs | x0exp | mantissas[0]);
-        float4 y0 = bit_pun<float4>(signs | y0exp | mantissas[1]);
-        float4 x1 = bit_pun<float4>(signs | x1exp | mantissas[2]);
-        float4 y1 = bit_pun<float4>(signs | y1exp | mantissas[3]);
-        float4 rads = approx_angle_between_vectors(skvx::join(x0, y0), skvx::join(x1, y1));
-        for (int j = 0; j < 4; ++j) {
-            if (!check_approx_angle_between_vectors(r, {x0[j], y0[j]}, {x1[j], y1[j]}, rads[j])) {
-                return;
-            }
-        }
-        // Adding primes makes sure we test every value before we repeat.
-        mantissas = (mantissas + uint4{123456791, 201345691, 198765433, 156789029}) & ((1<<23) - 1);
-        exp = (exp + uint4{79, 83, 199, 7}) & 0xff;
-    }
-}
-
-template<int N, typename T> void check_strided_loads(skiatest::Reporter* r) {
-    using Vec = skvx::Vec<N,T>;
-    T values[N*4];
-    std::iota(values, values + N*4, 0);
-    Vec a, b, c, d;
-    grvx::strided_load2(values, a, b);
-    for (int i = 0; i < N; ++i) {
-        REPORTER_ASSERT(r, a[i] == values[i*2]);
-        REPORTER_ASSERT(r, b[i] == values[i*2 + 1]);
-    }
-    grvx::strided_load4(values, a, b, c, d);
-    for (int i = 0; i < N; ++i) {
-        REPORTER_ASSERT(r, a[i] == values[i*4]);
-        REPORTER_ASSERT(r, b[i] == values[i*4 + 1]);
-        REPORTER_ASSERT(r, c[i] == values[i*4 + 2]);
-        REPORTER_ASSERT(r, d[i] == values[i*4 + 3]);
-    }
-}
-
-template<typename T> void check_strided_loads(skiatest::Reporter* r) {
-    check_strided_loads<1,T>(r);
-    check_strided_loads<2,T>(r);
-    check_strided_loads<4,T>(r);
-    check_strided_loads<8,T>(r);
-    check_strided_loads<16,T>(r);
-    check_strided_loads<32,T>(r);
-}
-
-DEF_TEST(GrVx_strided_loads, r) {
-    check_strided_loads<uint32_t>(r);
-    check_strided_loads<uint16_t>(r);
-    check_strided_loads<uint8_t>(r);
-    check_strided_loads<int32_t>(r);
-    check_strided_loads<int16_t>(r);
-    check_strided_loads<int8_t>(r);
-    check_strided_loads<float>(r);
-}
diff --git a/tests/SkVxTest.cpp b/tests/SkVxTest.cpp
index 8291934..3177366 100644
--- a/tests/SkVxTest.cpp
+++ b/tests/SkVxTest.cpp
@@ -10,6 +10,7 @@
 
 #include "include/private/SkVx.h"
 #include "tests/Test.h"
+#include <numeric>
 
 using float2 = skvx::Vec<2,float>;
 using float4 = skvx::Vec<4,float>;
@@ -28,6 +29,10 @@
 using int4 = skvx::Vec<4,int32_t>;
 using int8 = skvx::Vec<8,int32_t>;
 
+using uint2 = skvx::Vec<2,uint32_t>;
+using uint4 = skvx::Vec<4,uint32_t>;
+using uint8 = skvx::Vec<8,uint32_t>;
+
 using long2 = skvx::Vec<2,int64_t>;
 using long4 = skvx::Vec<4,int64_t>;
 using long8 = skvx::Vec<8,int64_t>;
@@ -254,3 +259,133 @@
                                      float2(1).xyxy(),
                                      float2(2).xyxy()) == float4(1,1,2,2)));
 }
+
+static bool check_approx_acos(skiatest::Reporter* r, float x, float approx_acos_x) {
+    float acosf_x = acosf(x);
+    float error = acosf_x - approx_acos_x;
+    if (!(fabsf(error) <= SKVX_APPROX_ACOS_MAX_ERROR)) {
+        ERRORF(r, "Larger-than-expected error from skvx::approx_acos\n"
+                  "  x=              %f\n"
+                  "  approx_acos_x=  %f  (%f degrees\n"
+                  "  acosf_x=        %f  (%f degrees\n"
+                  "  error=          %f  (%f degrees)\n"
+                  "  tolerance=      %f  (%f degrees)\n\n",
+                  x, approx_acos_x, SkRadiansToDegrees(approx_acos_x), acosf_x,
+                  SkRadiansToDegrees(acosf_x), error, SkRadiansToDegrees(error),
+                  SKVX_APPROX_ACOS_MAX_ERROR, SkRadiansToDegrees(SKVX_APPROX_ACOS_MAX_ERROR));
+        return false;
+    }
+    return true;
+}
+
+DEF_TEST(SkVx_approx_acos, r) {
+    float4 boundaries = skvx::approx_acos(float4{-1, 0, 1, 0});
+    check_approx_acos(r, -1, boundaries[0]);
+    check_approx_acos(r, 0, boundaries[1]);
+    check_approx_acos(r, +1, boundaries[2]);
+
+    // Select a distribution of starting points around which to begin testing approx_acos. These
+    // fall roughly around the known minimum and maximum errors. No need to include -1, 0, or 1
+    // since those were just tested above. (Those are tricky because 0 is an inflection and the
+    // derivative is infinite at 1 and -1.)
+    float8 x = {-.99f, -.8f, -.4f, -.2f, .2f, .4f, .8f, .99f};
+
+    // Converge at the various local minima and maxima of "approx_acos(x) - cosf(x)" and verify that
+    // approx_acos is always within "kTolerance" degrees of the expected answer.
+    float8 err_;
+    for (int iter = 0; iter < 10; ++iter) {
+        // Run our approximate inverse cosine approximation.
+        auto approx_acos_x = skvx::approx_acos(x);
+
+        // Find d/dx(error)
+        //    = d/dx(approx_acos(x) - acos(x))
+        //    = (f'g - fg')/gg + 1/sqrt(1 - x^2), [where f = bx^3 + ax, g = dx^4 + cx^2 + 1]
+        float8 xx = x*x;
+        float8 a = -0.939115566365855f;
+        float8 b =  0.9217841528914573f;
+        float8 c = -1.2845906244690837f;
+        float8 d =  0.295624144969963174f;
+        float8 f = (b*xx + a)*x;
+        float8 f_ = 3*b*xx + a;
+        float8 g = (d*xx + c)*xx + 1;
+        float8 g_ = (4*d*xx + 2*c)*x;
+        float8 gg = g*g;
+        float8 q = skvx::sqrt(1 - xx);
+        err_ = (f_*g - f*g_)/gg + 1/q;
+
+        // Find d^2/dx^2(error)
+        //    = ((f''g - fg'')g^2 - (f'g - fg')2gg') / g^4 + x(1 - x^2)^(-3/2)
+        //    = ((f''g - fg'')g - (f'g - fg')2g') / g^3 + x(1 - x^2)^(-3/2)
+        float8 f__ = 6*b*x;
+        float8 g__ = 12*d*xx + 2*c;
+        float8 err__ = ((f__*g - f*g__)*g - (f_*g - f*g_)*2*g_) / (gg*g) + x/((1 - xx)*q);
+
+#if 0
+        SkDebugf("\n\niter %i\n", iter);
+#endif
+        // Ensure each lane's approximation is within maximum error.
+        for (int j = 0; j < 8; ++j) {
+#if 0
+            SkDebugf("x=%f  err=%f  err'=%f  err''=%f\n",
+                     x[j], SkRadiansToDegrees(skvx::approx_acos_x[j] - acosf(x[j])),
+                     SkRadiansToDegrees(err_[j]), SkRadiansToDegrees(err__[j]));
+#endif
+            if (!check_approx_acos(r, x[j], approx_acos_x[j])) {
+                return;
+            }
+        }
+
+        // Use Newton's method to update the x values to locations closer to their local minimum or
+        // maximum. (This is where d/dx(error) == 0.)
+        x -= err_/err__;
+        x = skvx::pin<8,float>(x, -.99f, .99f);
+    }
+
+    // Ensure each lane converged to a local minimum or maximum.
+    for (int j = 0; j < 8; ++j) {
+        REPORTER_ASSERT(r, SkScalarNearlyZero(err_[j]));
+    }
+
+    // Make sure we found all the actual known locations of local min/max error.
+    for (float knownRoot : {-0.983536f, -0.867381f, -0.410923f, 0.410923f, 0.867381f, 0.983536f}) {
+        REPORTER_ASSERT(r, skvx::any(skvx::abs(x - knownRoot) < SK_ScalarNearlyZero));
+    }
+}
+
+template<int N, typename T> void check_strided_loads(skiatest::Reporter* r) {
+    using Vec = skvx::Vec<N,T>;
+    T values[N*4];
+    std::iota(values, values + N*4, 0);
+    Vec a, b, c, d;
+    skvx::strided_load2(values, a, b);
+    for (int i = 0; i < N; ++i) {
+        REPORTER_ASSERT(r, a[i] == values[i*2]);
+        REPORTER_ASSERT(r, b[i] == values[i*2 + 1]);
+    }
+    skvx::strided_load4(values, a, b, c, d);
+    for (int i = 0; i < N; ++i) {
+        REPORTER_ASSERT(r, a[i] == values[i*4]);
+        REPORTER_ASSERT(r, b[i] == values[i*4 + 1]);
+        REPORTER_ASSERT(r, c[i] == values[i*4 + 2]);
+        REPORTER_ASSERT(r, d[i] == values[i*4 + 3]);
+    }
+}
+
+template<typename T> void check_strided_loads(skiatest::Reporter* r) {
+    check_strided_loads<1,T>(r);
+    check_strided_loads<2,T>(r);
+    check_strided_loads<4,T>(r);
+    check_strided_loads<8,T>(r);
+    check_strided_loads<16,T>(r);
+    check_strided_loads<32,T>(r);
+}
+
+DEF_TEST(SkVx_strided_loads, r) {
+    check_strided_loads<uint32_t>(r);
+    check_strided_loads<uint16_t>(r);
+    check_strided_loads<uint8_t>(r);
+    check_strided_loads<int32_t>(r);
+    check_strided_loads<int16_t>(r);
+    check_strided_loads<int8_t>(r);
+    check_strided_loads<float>(r);
+}