finish up load4/store4 refactoring

I saved the easiest for last: there's no generated-code diff for store_f32.
This just moves its platform-specific code over to SkJumper_vectors.h.

Also clarify the types in the existing load4()/store4() functions, which now
take uint16_t* or float* instead of void*.

SkJumper_stages.cpp looks good to start growing again!
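
With store4() handling the per-platform shuffling, the stage itself stays
portable.  As a sketch (not part of this diff; the STAGE macro and the
ctx/x/tail plumbing are assumed from SkJumper_stages.cpp), store_f32 reduces
to something like:

    STAGE(store_f32) {
        auto ptr = *(float**)ctx + 4*x;   // 4 floats (rgba) per pixel
        store4(ptr,tail, r,g,b,a);
    }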

Change-Id: I6a8599d090b4e17663703b0c0325dbe550a6cdd8
Reviewed-on: https://skia-review.googlesource.com/11348
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index 1685da9..1b72ce7 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -41,15 +41,19 @@
 
     SI F gather(const float* p, U32 ix) { return p[ix]; }
 
-    SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-        auto ptr = (const uint16_t*)vptr;
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         *r = ptr[0];
         *g = ptr[1];
         *b = ptr[2];
         *a = ptr[3];
     }
-    SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-        auto ptr = (uint16_t*)vptr;
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        ptr[0] = r;
+        ptr[1] = g;
+        ptr[2] = b;
+        ptr[3] = a;
+    }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         ptr[0] = r;
         ptr[1] = g;
         ptr[2] = b;
@@ -92,16 +96,18 @@
 
     SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 
-    SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-        uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+        uint16x4x4_t rgba = vld4_u16(ptr);
         *r = rgba.val[0];
         *g = rgba.val[1];
         *b = rgba.val[2];
         *a = rgba.val[3];
     }
-    SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-        uint16x4x4_t rgba = {{r,g,b,a}};
-        vst4_u16((uint16_t*)ptr, rgba);
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));  // store 4 interleaved rgba pixels
+    }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));  // store 4 interleaved rgba float pixels
     }
 
     SI F from_half(U16 h) { return vcvt_f32_f16(h); }
@@ -139,8 +145,7 @@
 
     SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
 
-    SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-        auto ptr = (const uint16_t*)vptr;
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         uint16x4x4_t rgba;
         rgba = vld4_lane_u16(ptr + 0, rgba, 0);
         rgba = vld4_lane_u16(ptr + 4, rgba, 1);
@@ -149,8 +154,7 @@
         *b = unaligned_load<U16>(rgba.val+2);
         *a = unaligned_load<U16>(rgba.val+3);
     }
-    SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-        auto ptr = (uint16_t*)vptr;
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         uint16x4x4_t rgba = {{
             widen_cast<uint16x4_t>(r),
             widen_cast<uint16x4_t>(g),
@@ -160,6 +164,9 @@
         vst4_lane_u16(ptr + 0, rgba, 0);
         vst4_lane_u16(ptr + 4, rgba, 1);
     }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));  // store 2 interleaved rgba float pixels
+    }
 
     SI F from_half(U16 h) {
         auto v = widen_cast<uint16x4_t>(h);
@@ -217,7 +224,7 @@
     #endif
     }
 
-    SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         __m128i _01, _23, _45, _67;
         if (__builtin_expect(tail,0)) {
             auto src = (const double*)ptr;
@@ -251,7 +258,7 @@
         *b = _mm_unpacklo_epi64(ba0123, ba4567);
         *a = _mm_unpackhi_epi64(ba0123, ba4567);
     }
-    SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
              rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
              ba0123 = _mm_unpacklo_epi16(b, a),
@@ -278,6 +285,36 @@
             _mm_storeu_si128((__m128i*)ptr + 3, _67);
         }
     }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
+          rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
+          ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
+          ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...
+
+        F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
+          _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
+          _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
+          _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...
+
+        if (__builtin_expect(tail, 0)) {
+            if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
+            if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
+            if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
+            if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
+            if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
+            if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
+            if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
+        } else {
+            F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
+              _23 = _mm256_permute2f128_ps(_26, _37, 32),
+              _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
+              _67 = _mm256_permute2f128_ps(_26, _37, 49);
+            _mm256_storeu_ps(ptr+ 0, _01);
+            _mm256_storeu_ps(ptr+ 8, _23);
+            _mm256_storeu_ps(ptr+16, _45);
+            _mm256_storeu_ps(ptr+24, _67);
+        }
+    }
 
     SI F from_half(U16 h) {
     #if defined(__AVX2__)
@@ -350,7 +387,7 @@
 
     SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 
-    SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+    SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
         auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
              _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
 
@@ -365,12 +402,19 @@
         *b = unaligned_load<U16>((uint16_t*)&ba + 0);
         *a = unaligned_load<U16>((uint16_t*)&ba + 4);
     }
-    SI void store4(const void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+    SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
              ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
         _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
         _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
     }
+    SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+        _MM_TRANSPOSE4_PS(r,g,b,a);  // planar r,g,b,a -> 4 interleaved rgba pixels
+        _mm_storeu_ps(ptr+ 0, r);
+        _mm_storeu_ps(ptr+ 4, g);
+        _mm_storeu_ps(ptr+ 8, b);
+        _mm_storeu_ps(ptr+12, a);
+    }
 
     SI F from_half(U16 h) {
         auto v = widen_cast<__m128i>(h);
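
For reference, the new 4-wide store4(float*) above is just a 4x4 transpose
from planar registers to interleaved rgba pixels.  A minimal, self-contained
sketch of the same pattern outside SkJumper (plain C++ with SSE intrinsics,
no SkJumper types; names are illustrative):

    #include <xmmintrin.h>
    #include <cstdio>

    // Planar r,g,b,a (4 pixels per register) -> rgba rgba rgba rgba in memory,
    // the same transpose-and-store used by the SSE store4(float*) overload.
    static void store4_f32(float* ptr, __m128 r, __m128 g, __m128 b, __m128 a) {
        _MM_TRANSPOSE4_PS(r, g, b, a);   // r,g,b,a now hold pixels 0..3
        _mm_storeu_ps(ptr +  0, r);
        _mm_storeu_ps(ptr +  4, g);
        _mm_storeu_ps(ptr +  8, b);
        _mm_storeu_ps(ptr + 12, a);
    }

    int main() {
        __m128 r = _mm_setr_ps(0.0f, 0.1f, 0.2f, 0.3f),
               g = _mm_setr_ps(1.0f, 1.1f, 1.2f, 1.3f),
               b = _mm_setr_ps(2.0f, 2.1f, 2.2f, 2.3f),
               a = _mm_setr_ps(3.0f, 3.1f, 3.2f, 3.3f);
        float px[16];
        store4_f32(px, r, g, b, a);
        for (int i = 0; i < 4; i++) {
            printf("pixel %d: r=%.1f g=%.1f b=%.1f a=%.1f\n",
                   i, px[4*i+0], px[4*i+1], px[4*i+2], px[4*i+3]);
        }
        return 0;
    }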