Implement Sk2f::Store4

Bug: skia:
Change-Id: I2adb983d68625d327e7c00e53b6ae4703b46252f
Reviewed-on: https://skia-review.googlesource.com/104761
Commit-Queue: Chris Dalton <csmartdalton@google.com>
Reviewed-by: Mike Klein <mtklein@chromium.org>
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 8d1a249..f8170ef 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -43,6 +43,16 @@
         vst3_f32((float*) dst, abc);
     }
 
+    AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        // vst4 interleaves the four registers: dst = a0 b0 c0 d0 a1 b1 c1 d1.
+        float32x2x4_t lanes;
+        lanes.val[0] = a.fVec;
+        lanes.val[1] = b.fVec;
+        lanes.val[2] = c.fVec;
+        lanes.val[3] = d.fVec;
+        vst4_f32(static_cast<float*>(dst), lanes);
+    }
+
     AI SkNx invert() const {
         float32x2_t est0 = vrecpe_f32(fVec),
                     est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index dc9a594..3b530f0 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -36,6 +36,13 @@
         _mm_storel_pi(((__m64*)dst) + 2, hi);
     }
 
+    AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
+        const __m128 lane0 = _mm_setr_ps(a[0], b[0], c[0], d[0]);  // first lane of each input
+        const __m128 lane1 = _mm_setr_ps(a[1], b[1], c[1], d[1]);  // second lane of each input
+        _mm_storeu_ps(static_cast<float*>(dst), lane0);            // floats 0..3, unaligned store
+        _mm_storeu_ps(static_cast<float*>(dst) + 4, lane1);        // floats 4..7, unaligned store
+    }
+
     AI SkNx operator - () const { return _mm_xor_ps(_mm_set1_ps(-0.0f), fVec); }
 
     AI SkNx operator + (const SkNx& o) const { return _mm_add_ps(fVec, o.fVec); }
diff --git a/tests/SkNxTest.cpp b/tests/SkNxTest.cpp
index 349ffb6..9e2c27e 100644
--- a/tests/SkNxTest.cpp
+++ b/tests/SkNxTest.cpp
@@ -437,3 +437,20 @@
     REPORTER_ASSERT(r, dst[4] == 4);
     REPORTER_ASSERT(r, dst[5] == 5);
 }
+
+DEF_TEST(Sk2f_Store4, r) {  // Sk2f::Store4 must write its four inputs interleaved into dst.
+    Sk2f p0{0, 4};  // expected at dst[0] and dst[4]
+    Sk2f p1{1, 5};  // expected at dst[1] and dst[5]
+    Sk2f p2{2, 6};  // expected at dst[2] and dst[6]
+    Sk2f p3{3, 7};  // expected at dst[3] and dst[7]
+    float dst[8];   // left uninitialized; every element is checked after the store
+    Sk2f::Store4(dst, p0, p1, p2, p3);
+    REPORTER_ASSERT(r, dst[0] == 0);  // first element of p0..p3 fills dst[0..3]
+    REPORTER_ASSERT(r, dst[1] == 1);
+    REPORTER_ASSERT(r, dst[2] == 2);
+    REPORTER_ASSERT(r, dst[3] == 3);
+    REPORTER_ASSERT(r, dst[4] == 4);  // second element of p0..p3 fills dst[4..7]
+    REPORTER_ASSERT(r, dst[5] == 5);
+    REPORTER_ASSERT(r, dst[6] == 6);
+    REPORTER_ASSERT(r, dst[7] == 7);
+}