finish up load4/store4 refactoring
I saved the easiest for last. No generated code diff for store_f32.
This just moves the platform-specific code over to SkJumper_vectors.h
Also clarify types in the existing load4()/store4() functions.
SkJumper_stages.cpp looks good to start growing again!
Change-Id: I6a8599d090b4e17663703b0c0325dbe550a6cdd8
Reviewed-on: https://skia-review.googlesource.com/11348
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index 1685da9..1b72ce7 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -41,15 +41,19 @@
SI F gather(const float* p, U32 ix) { return p[ix]; }
- SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
- auto ptr = (const uint16_t*)vptr;
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
*r = ptr[0];
*g = ptr[1];
*b = ptr[2];
*a = ptr[3];
}
- SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
- auto ptr = (uint16_t*)vptr;
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ ptr[0] = r;
+ ptr[1] = g;
+ ptr[2] = b;
+ ptr[3] = a;
+ }
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
ptr[0] = r;
ptr[1] = g;
ptr[2] = b;
@@ -92,16 +96,18 @@
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
- SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
- uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+ uint16x4x4_t rgba = vld4_u16(ptr);
*r = rgba.val[0];
*g = rgba.val[1];
*b = rgba.val[2];
*a = rgba.val[3];
}
- SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
- uint16x4x4_t rgba = {{r,g,b,a}};
- vst4_u16((uint16_t*)ptr, rgba);
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
+ }
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
}
SI F from_half(U16 h) { return vcvt_f32_f16(h); }
@@ -139,8 +145,7 @@
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
- SI void load4(const void* vptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
- auto ptr = (const uint16_t*)vptr;
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
uint16x4x4_t rgba;
rgba = vld4_lane_u16(ptr + 0, rgba, 0);
rgba = vld4_lane_u16(ptr + 4, rgba, 1);
@@ -149,8 +154,7 @@
*b = unaligned_load<U16>(rgba.val+2);
*a = unaligned_load<U16>(rgba.val+3);
}
- SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
- auto ptr = (uint16_t*)vptr;
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
uint16x4x4_t rgba = {{
widen_cast<uint16x4_t>(r),
widen_cast<uint16x4_t>(g),
@@ -160,6 +164,9 @@
vst4_lane_u16(ptr + 0, rgba, 0);
vst4_lane_u16(ptr + 4, rgba, 1);
}
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
+ }
SI F from_half(U16 h) {
auto v = widen_cast<uint16x4_t>(h);
@@ -217,7 +224,7 @@
#endif
}
- SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
__m128i _01, _23, _45, _67;
if (__builtin_expect(tail,0)) {
auto src = (const double*)ptr;
@@ -251,7 +258,7 @@
*b = _mm_unpacklo_epi64(ba0123, ba4567);
*a = _mm_unpackhi_epi64(ba0123, ba4567);
}
- SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3
rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7
ba0123 = _mm_unpacklo_epi16(b, a),
@@ -278,6 +285,36 @@
_mm_storeu_si128((__m128i*)ptr + 3, _67);
}
}
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
+ rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
+ ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
+ ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
+
+ F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
+ _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
+ _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
+ _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
+
+ if (__builtin_expect(tail, 0)) {
+ if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
+ if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
+ if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
+ if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
+ if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
+ if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
+ if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
+ } else {
+ F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
+ _23 = _mm256_permute2f128_ps(_26, _37, 32),
+ _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
+ _67 = _mm256_permute2f128_ps(_26, _37, 49);
+ _mm256_storeu_ps(ptr+ 0, _01);
+ _mm256_storeu_ps(ptr+ 8, _23);
+ _mm256_storeu_ps(ptr+16, _45);
+ _mm256_storeu_ps(ptr+24, _67);
+ }
+ }
SI F from_half(U16 h) {
#if defined(__AVX2__)
@@ -350,7 +387,7 @@
SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
- SI void load4(const void* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
+ SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
@@ -365,12 +402,19 @@
*b = unaligned_load<U16>((uint16_t*)&ba + 0);
*a = unaligned_load<U16>((uint16_t*)&ba + 4);
}
- SI void store4(const void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+ SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
_mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
}
+ SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
+ _MM_TRANSPOSE4_PS(r,g,b,a);
+ _mm_storeu_ps(ptr+ 0, r);
+ _mm_storeu_ps(ptr+ 4, g);
+ _mm_storeu_ps(ptr+ 8, b);
+ _mm_storeu_ps(ptr+12, a);
+ }
SI F from_half(U16 h) {
auto v = widen_cast<__m128i>(h);