jumper, add load_f32()

Change-Id: I71d85ffe29bc11678ff1e696fa4a2c93d0b4fcbe
Reviewed-on: https://skia-review.googlesource.com/11446
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index 4ad2afd..e4836aa 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -99,6 +99,7 @@
     M(store_f16)          \
     M(load_u16_be)        \
     M(store_u16_be)       \
+    M(load_f32)           \
     M(store_f32)          \
     M(luminance_to_alpha) \
     M(matrix_2x3)         \
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 75e1ad2..cff0904 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -1678,6 +1678,15 @@
   .long  0x91004021                          // add           x1, x1, #0x10
   .long  0xd61f0060                          // br            x3
 
+HIDDEN _sk_load_f32_aarch64
+.globl _sk_load_f32_aarch64
+_sk_load_f32_aarch64:
+  .long  0xa8c10c28                          // ldp           x8, x3, [x1], #16
+  .long  0xf9400108                          // ldr           x8, [x8]
+  .long  0x8b001108                          // add           x8, x8, x0, lsl #4
+  .long  0x4c400900                          // ld4           {v0.4s-v3.4s}, [x8]
+  .long  0xd61f0060                          // br            x3
+
 HIDDEN _sk_store_f32_aarch64
 .globl _sk_store_f32_aarch64
 _sk_store_f32_aarch64:
@@ -3760,6 +3769,16 @@
   .long  0x477fff00                          // .word         0x477fff00
   .long  0x477fff00                          // .word         0x477fff00
 
+HIDDEN _sk_load_f32_vfp4
+.globl _sk_load_f32_vfp4
+_sk_load_f32_vfp4:
+  .long  0xe8911008                          // ldm           r1, {r3, ip}
+  .long  0xe2811008                          // add           r1, r1, #8
+  .long  0xe5933000                          // ldr           r3, [r3]
+  .long  0xe0833200                          // add           r3, r3, r0, lsl #4
+  .long  0xf423008f                          // vld4.32       {d0-d3}, [r3]
+  .long  0xe12fff1c                          // bx            ip
+
 HIDDEN _sk_store_f32_vfp4
 .globl _sk_store_f32_vfp4
 _sk_store_f32_vfp4:
@@ -5768,7 +5787,7 @@
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  233,255,255,255,225                 // jmpq          ffffffffe2001650 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff563>
+  .byte  233,255,255,255,225                 // jmpq          ffffffffe2001650 <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4a4>
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -6190,6 +6209,56 @@
   .byte  196,65,121,214,68,248,48            // vmovq         %xmm8,0x30(%r8,%rdi,8)
   .byte  235,174                             // jmp           1c57 <_sk_store_u16_be_hsw+0xef>
 
+HIDDEN _sk_load_f32_hsw
+.globl _sk_load_f32_hsw
+_sk_load_f32_hsw:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,131,249,7                        // cmp           $0x7,%rcx
+  .byte  119,110                             // ja            1d1f <_sk_load_f32_hsw+0x76>
+  .byte  76,139,0                            // mov           (%rax),%r8
+  .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
+  .byte  76,141,21,133,0,0,0                 // lea           0x85(%rip),%r10        # 1d48 <_sk_load_f32_hsw+0x9f>
+  .byte  73,99,4,138                         // movslq        (%r10,%rcx,4),%rax
+  .byte  76,1,208                            // add           %r10,%rax
+  .byte  255,224                             // jmpq          *%rax
+  .byte  196,3,125,24,68,136,112,1           // vinsertf128   $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
+  .byte  196,131,125,24,92,136,96,1          // vinsertf128   $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
+  .byte  196,131,125,24,76,136,80,1          // vinsertf128   $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
+  .byte  196,131,125,24,84,136,64,1          // vinsertf128   $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
+  .byte  196,129,121,16,68,136,48            // vmovupd       0x30(%r8,%r9,4),%xmm0
+  .byte  196,195,125,13,192,12               // vblendpd      $0xc,%ymm8,%ymm0,%ymm0
+  .byte  196,1,121,16,68,136,32              // vmovupd       0x20(%r8,%r9,4),%xmm8
+  .byte  196,99,61,13,203,12                 // vblendpd      $0xc,%ymm3,%ymm8,%ymm9
+  .byte  196,129,121,16,92,136,16            // vmovupd       0x10(%r8,%r9,4),%xmm3
+  .byte  196,99,101,13,209,12                // vblendpd      $0xc,%ymm1,%ymm3,%ymm10
+  .byte  196,129,121,16,12,136               // vmovupd       (%r8,%r9,4),%xmm1
+  .byte  196,227,117,13,202,12               // vblendpd      $0xc,%ymm2,%ymm1,%ymm1
+  .byte  196,193,116,20,210                  // vunpcklps     %ymm10,%ymm1,%ymm2
+  .byte  196,193,116,21,218                  // vunpckhps     %ymm10,%ymm1,%ymm3
+  .byte  197,180,20,200                      // vunpcklps     %ymm0,%ymm9,%ymm1
+  .byte  197,52,21,192                       // vunpckhps     %ymm0,%ymm9,%ymm8
+  .byte  197,237,20,193                      // vunpcklpd     %ymm1,%ymm2,%ymm0
+  .byte  197,237,21,201                      // vunpckhpd     %ymm1,%ymm2,%ymm1
+  .byte  196,193,101,20,208                  // vunpcklpd     %ymm8,%ymm3,%ymm2
+  .byte  196,193,101,21,216                  // vunpckhpd     %ymm8,%ymm3,%ymm3
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+  .byte  144                                 // nop
+  .byte  132,255                             // test          %bh,%bh
+  .byte  255                                 // (bad)
+  .byte  255,203                             // dec           %ebx
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  190,255,255,255,177                 // mov           $0xb1ffffff,%esi
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  255,164,255,255,255,156,255         // jmpq          *-0x630001(%rdi,%rdi,8)
+  .byte  255                                 // (bad)
+  .byte  255,148,255,255,255,140,255         // callq         *-0x730001(%rdi,%rdi,8)
+  .byte  255                                 // (bad)
+  .byte  255                                 // .byte         0xff
+
 HIDDEN _sk_store_f32_hsw
 .globl _sk_store_f32_hsw
 _sk_store_f32_hsw:
@@ -6205,7 +6274,7 @@
   .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
   .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           1d16 <_sk_store_f32_hsw+0x6d>
+  .byte  117,55                              // jne           1dd5 <_sk_store_f32_hsw+0x6d>
   .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -6218,22 +6287,22 @@
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            1d12 <_sk_store_f32_hsw+0x69>
+  .byte  116,240                             // je            1dd1 <_sk_store_f32_hsw+0x69>
   .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            1d12 <_sk_store_f32_hsw+0x69>
+  .byte  114,227                             // jb            1dd1 <_sk_store_f32_hsw+0x69>
   .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            1d12 <_sk_store_f32_hsw+0x69>
+  .byte  116,218                             // je            1dd1 <_sk_store_f32_hsw+0x69>
   .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            1d12 <_sk_store_f32_hsw+0x69>
+  .byte  114,205                             // jb            1dd1 <_sk_store_f32_hsw+0x69>
   .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            1d12 <_sk_store_f32_hsw+0x69>
+  .byte  116,195                             // je            1dd1 <_sk_store_f32_hsw+0x69>
   .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            1d12 <_sk_store_f32_hsw+0x69>
+  .byte  114,181                             // jb            1dd1 <_sk_store_f32_hsw+0x69>
   .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           1d12 <_sk_store_f32_hsw+0x69>
+  .byte  235,171                             // jmp           1dd1 <_sk_store_f32_hsw+0x69>
 
 HIDDEN _sk_clamp_x_hsw
 .globl _sk_clamp_x_hsw
@@ -9016,6 +9085,57 @@
   .byte  196,65,121,214,68,248,48            // vmovq         %xmm8,0x30(%r8,%rdi,8)
   .byte  235,174                             // jmp           255a <_sk_store_u16_be_avx+0xf6>
 
+HIDDEN _sk_load_f32_avx
+.globl _sk_load_f32_avx
+_sk_load_f32_avx:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,131,249,7                        // cmp           $0x7,%rcx
+  .byte  119,110                             // ja            2622 <_sk_load_f32_avx+0x76>
+  .byte  76,139,0                            // mov           (%rax),%r8
+  .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
+  .byte  76,141,21,134,0,0,0                 // lea           0x86(%rip),%r10        # 264c <_sk_load_f32_avx+0xa0>
+  .byte  73,99,4,138                         // movslq        (%r10,%rcx,4),%rax
+  .byte  76,1,208                            // add           %r10,%rax
+  .byte  255,224                             // jmpq          *%rax
+  .byte  196,3,125,24,68,136,112,1           // vinsertf128   $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
+  .byte  196,131,125,24,92,136,96,1          // vinsertf128   $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
+  .byte  196,131,125,24,76,136,80,1          // vinsertf128   $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
+  .byte  196,131,125,24,84,136,64,1          // vinsertf128   $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
+  .byte  196,129,121,16,68,136,48            // vmovupd       0x30(%r8,%r9,4),%xmm0
+  .byte  196,195,125,13,192,12               // vblendpd      $0xc,%ymm8,%ymm0,%ymm0
+  .byte  196,1,121,16,68,136,32              // vmovupd       0x20(%r8,%r9,4),%xmm8
+  .byte  196,99,61,13,203,12                 // vblendpd      $0xc,%ymm3,%ymm8,%ymm9
+  .byte  196,129,121,16,92,136,16            // vmovupd       0x10(%r8,%r9,4),%xmm3
+  .byte  196,99,101,13,209,12                // vblendpd      $0xc,%ymm1,%ymm3,%ymm10
+  .byte  196,129,121,16,12,136               // vmovupd       (%r8,%r9,4),%xmm1
+  .byte  196,227,117,13,202,12               // vblendpd      $0xc,%ymm2,%ymm1,%ymm1
+  .byte  196,193,116,20,210                  // vunpcklps     %ymm10,%ymm1,%ymm2
+  .byte  196,193,116,21,218                  // vunpckhps     %ymm10,%ymm1,%ymm3
+  .byte  197,180,20,200                      // vunpcklps     %ymm0,%ymm9,%ymm1
+  .byte  197,52,21,192                       // vunpckhps     %ymm0,%ymm9,%ymm8
+  .byte  197,237,20,193                      // vunpcklpd     %ymm1,%ymm2,%ymm0
+  .byte  197,237,21,201                      // vunpckhpd     %ymm1,%ymm2,%ymm1
+  .byte  196,193,101,20,208                  // vunpcklpd     %ymm8,%ymm3,%ymm2
+  .byte  196,193,101,21,216                  // vunpckhpd     %ymm8,%ymm3,%ymm3
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+  .byte  102,144                             // xchg          %ax,%ax
+  .byte  131,255,255                         // cmp           $0xffffffff,%edi
+  .byte  255,202                             // dec           %edx
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  189,255,255,255,176                 // mov           $0xb0ffffff,%ebp
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  255,163,255,255,255,155             // jmpq          *-0x64000001(%rbx)
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  255,147,255,255,255,139             // callq         *-0x74000001(%rbx)
+  .byte  255                                 // (bad)
+  .byte  255                                 // (bad)
+  .byte  255                                 // .byte         0xff
+
 HIDDEN _sk_store_f32_avx
 .globl _sk_store_f32_avx
 _sk_store_f32_avx:
@@ -9031,7 +9151,7 @@
   .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
   .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           2619 <_sk_store_f32_avx+0x6d>
+  .byte  117,55                              // jne           26d9 <_sk_store_f32_avx+0x6d>
   .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -9044,22 +9164,22 @@
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            2615 <_sk_store_f32_avx+0x69>
+  .byte  116,240                             // je            26d5 <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            2615 <_sk_store_f32_avx+0x69>
+  .byte  114,227                             // jb            26d5 <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            2615 <_sk_store_f32_avx+0x69>
+  .byte  116,218                             // je            26d5 <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            2615 <_sk_store_f32_avx+0x69>
+  .byte  114,205                             // jb            26d5 <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            2615 <_sk_store_f32_avx+0x69>
+  .byte  116,195                             // je            26d5 <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            2615 <_sk_store_f32_avx+0x69>
+  .byte  114,181                             // jb            26d5 <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           2615 <_sk_store_f32_avx+0x69>
+  .byte  235,171                             // jmp           26d5 <_sk_store_f32_avx+0x69>
 
 HIDDEN _sk_clamp_x_avx
 .globl _sk_clamp_x_avx
@@ -11340,6 +11460,32 @@
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
+HIDDEN _sk_load_f32_sse41
+.globl _sk_load_f32_sse41
+_sk_load_f32_sse41:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,139,0                            // mov           (%rax),%rax
+  .byte  72,137,249                          // mov           %rdi,%rcx
+  .byte  72,193,225,4                        // shl           $0x4,%rcx
+  .byte  68,15,16,4,8                        // movups        (%rax,%rcx,1),%xmm8
+  .byte  15,16,68,8,16                       // movups        0x10(%rax,%rcx,1),%xmm0
+  .byte  15,16,92,8,32                       // movups        0x20(%rax,%rcx,1),%xmm3
+  .byte  68,15,16,76,8,48                    // movups        0x30(%rax,%rcx,1),%xmm9
+  .byte  65,15,40,208                        // movaps        %xmm8,%xmm2
+  .byte  15,20,208                           // unpcklps      %xmm0,%xmm2
+  .byte  15,40,203                           // movaps        %xmm3,%xmm1
+  .byte  65,15,20,201                        // unpcklps      %xmm9,%xmm1
+  .byte  68,15,21,192                        // unpckhps      %xmm0,%xmm8
+  .byte  65,15,21,217                        // unpckhps      %xmm9,%xmm3
+  .byte  15,40,194                           // movaps        %xmm2,%xmm0
+  .byte  102,15,20,193                       // unpcklpd      %xmm1,%xmm0
+  .byte  15,18,202                           // movhlps       %xmm2,%xmm1
+  .byte  65,15,40,208                        // movaps        %xmm8,%xmm2
+  .byte  102,15,20,211                       // unpcklpd      %xmm3,%xmm2
+  .byte  65,15,18,216                        // movhlps       %xmm8,%xmm3
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+
 HIDDEN _sk_store_f32_sse41
 .globl _sk_store_f32_sse41
 _sk_store_f32_sse41:
@@ -13771,6 +13917,32 @@
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
+HIDDEN _sk_load_f32_sse2
+.globl _sk_load_f32_sse2
+_sk_load_f32_sse2:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,139,0                            // mov           (%rax),%rax
+  .byte  72,137,249                          // mov           %rdi,%rcx
+  .byte  72,193,225,4                        // shl           $0x4,%rcx
+  .byte  68,15,16,4,8                        // movups        (%rax,%rcx,1),%xmm8
+  .byte  15,16,68,8,16                       // movups        0x10(%rax,%rcx,1),%xmm0
+  .byte  15,16,92,8,32                       // movups        0x20(%rax,%rcx,1),%xmm3
+  .byte  68,15,16,76,8,48                    // movups        0x30(%rax,%rcx,1),%xmm9
+  .byte  65,15,40,208                        // movaps        %xmm8,%xmm2
+  .byte  15,20,208                           // unpcklps      %xmm0,%xmm2
+  .byte  15,40,203                           // movaps        %xmm3,%xmm1
+  .byte  65,15,20,201                        // unpcklps      %xmm9,%xmm1
+  .byte  68,15,21,192                        // unpckhps      %xmm0,%xmm8
+  .byte  65,15,21,217                        // unpckhps      %xmm9,%xmm3
+  .byte  15,40,194                           // movaps        %xmm2,%xmm0
+  .byte  102,15,20,193                       // unpcklpd      %xmm1,%xmm0
+  .byte  15,18,202                           // movhlps       %xmm2,%xmm1
+  .byte  65,15,40,208                        // movaps        %xmm8,%xmm2
+  .byte  102,15,20,211                       // unpcklpd      %xmm3,%xmm2
+  .byte  65,15,18,216                        // movhlps       %xmm8,%xmm3
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+
 HIDDEN _sk_store_f32_sse2
 .globl _sk_store_f32_sse2
 _sk_store_f32_sse2:
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index 4ec22db..2fa69f5 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -1607,7 +1607,7 @@
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
-  DB  233,255,255,255,225                 ; jmpq          ffffffffe20016ec <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff563>
+  DB  233,255,255,255,225                 ; jmpq          ffffffffe20016ec <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4a4>
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -2022,6 +2022,55 @@
   DB  196,65,121,214,68,248,48            ; vmovq         %xmm8,0x30(%r8,%rdi,8)
   DB  235,174                             ; jmp           1cf3 <_sk_store_u16_be_hsw+0xef>
 
+PUBLIC _sk_load_f32_hsw
+_sk_load_f32_hsw LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,131,249,7                        ; cmp           $0x7,%rcx
+  DB  119,110                             ; ja            1dbb <_sk_load_f32_hsw+0x76>
+  DB  76,139,0                            ; mov           (%rax),%r8
+  DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
+  DB  76,141,21,133,0,0,0                 ; lea           0x85(%rip),%r10        # 1de4 <_sk_load_f32_hsw+0x9f>
+  DB  73,99,4,138                         ; movslq        (%r10,%rcx,4),%rax
+  DB  76,1,208                            ; add           %r10,%rax
+  DB  255,224                             ; jmpq          *%rax
+  DB  196,3,125,24,68,136,112,1           ; vinsertf128   $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
+  DB  196,131,125,24,92,136,96,1          ; vinsertf128   $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
+  DB  196,131,125,24,76,136,80,1          ; vinsertf128   $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
+  DB  196,131,125,24,84,136,64,1          ; vinsertf128   $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
+  DB  196,129,121,16,68,136,48            ; vmovupd       0x30(%r8,%r9,4),%xmm0
+  DB  196,195,125,13,192,12               ; vblendpd      $0xc,%ymm8,%ymm0,%ymm0
+  DB  196,1,121,16,68,136,32              ; vmovupd       0x20(%r8,%r9,4),%xmm8
+  DB  196,99,61,13,203,12                 ; vblendpd      $0xc,%ymm3,%ymm8,%ymm9
+  DB  196,129,121,16,92,136,16            ; vmovupd       0x10(%r8,%r9,4),%xmm3
+  DB  196,99,101,13,209,12                ; vblendpd      $0xc,%ymm1,%ymm3,%ymm10
+  DB  196,129,121,16,12,136               ; vmovupd       (%r8,%r9,4),%xmm1
+  DB  196,227,117,13,202,12               ; vblendpd      $0xc,%ymm2,%ymm1,%ymm1
+  DB  196,193,116,20,210                  ; vunpcklps     %ymm10,%ymm1,%ymm2
+  DB  196,193,116,21,218                  ; vunpckhps     %ymm10,%ymm1,%ymm3
+  DB  197,180,20,200                      ; vunpcklps     %ymm0,%ymm9,%ymm1
+  DB  197,52,21,192                       ; vunpckhps     %ymm0,%ymm9,%ymm8
+  DB  197,237,20,193                      ; vunpcklpd     %ymm1,%ymm2,%ymm0
+  DB  197,237,21,201                      ; vunpckhpd     %ymm1,%ymm2,%ymm1
+  DB  196,193,101,20,208                  ; vunpcklpd     %ymm8,%ymm3,%ymm2
+  DB  196,193,101,21,216                  ; vunpckhpd     %ymm8,%ymm3,%ymm3
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+  DB  144                                 ; nop
+  DB  132,255                             ; test          %bh,%bh
+  DB  255                                 ; (bad)
+  DB  255,203                             ; dec           %ebx
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  190,255,255,255,177                 ; mov           $0xb1ffffff,%esi
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  255,164,255,255,255,156,255         ; jmpq          *-0x630001(%rdi,%rdi,8)
+  DB  255                                 ; (bad)
+  DB  255,148,255,255,255,140,255         ; callq         *-0x730001(%rdi,%rdi,8)
+  DB  255                                 ; (bad)
+  DB  255                                 ; .byte         0xff
+
 PUBLIC _sk_store_f32_hsw
 _sk_store_f32_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -2036,7 +2085,7 @@
   DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
   DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           1db2 <_sk_store_f32_hsw+0x6d>
+  DB  117,55                              ; jne           1e71 <_sk_store_f32_hsw+0x6d>
   DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -2049,22 +2098,22 @@
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            1dae <_sk_store_f32_hsw+0x69>
+  DB  116,240                             ; je            1e6d <_sk_store_f32_hsw+0x69>
   DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            1dae <_sk_store_f32_hsw+0x69>
+  DB  114,227                             ; jb            1e6d <_sk_store_f32_hsw+0x69>
   DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            1dae <_sk_store_f32_hsw+0x69>
+  DB  116,218                             ; je            1e6d <_sk_store_f32_hsw+0x69>
   DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            1dae <_sk_store_f32_hsw+0x69>
+  DB  114,205                             ; jb            1e6d <_sk_store_f32_hsw+0x69>
   DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            1dae <_sk_store_f32_hsw+0x69>
+  DB  116,195                             ; je            1e6d <_sk_store_f32_hsw+0x69>
   DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            1dae <_sk_store_f32_hsw+0x69>
+  DB  114,181                             ; jb            1e6d <_sk_store_f32_hsw+0x69>
   DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           1dae <_sk_store_f32_hsw+0x69>
+  DB  235,171                             ; jmp           1e6d <_sk_store_f32_hsw+0x69>
 
 PUBLIC _sk_clamp_x_hsw
 _sk_clamp_x_hsw LABEL PROC
@@ -4804,6 +4853,56 @@
   DB  196,65,121,214,68,248,48            ; vmovq         %xmm8,0x30(%r8,%rdi,8)
   DB  235,174                             ; jmp           25f6 <_sk_store_u16_be_avx+0xf6>
 
+PUBLIC _sk_load_f32_avx
+_sk_load_f32_avx LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,131,249,7                        ; cmp           $0x7,%rcx
+  DB  119,110                             ; ja            26be <_sk_load_f32_avx+0x76>
+  DB  76,139,0                            ; mov           (%rax),%r8
+  DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
+  DB  76,141,21,134,0,0,0                 ; lea           0x86(%rip),%r10        # 26e8 <_sk_load_f32_avx+0xa0>
+  DB  73,99,4,138                         ; movslq        (%r10,%rcx,4),%rax
+  DB  76,1,208                            ; add           %r10,%rax
+  DB  255,224                             ; jmpq          *%rax
+  DB  196,3,125,24,68,136,112,1           ; vinsertf128   $0x1,0x70(%r8,%r9,4),%ymm0,%ymm8
+  DB  196,131,125,24,92,136,96,1          ; vinsertf128   $0x1,0x60(%r8,%r9,4),%ymm0,%ymm3
+  DB  196,131,125,24,76,136,80,1          ; vinsertf128   $0x1,0x50(%r8,%r9,4),%ymm0,%ymm1
+  DB  196,131,125,24,84,136,64,1          ; vinsertf128   $0x1,0x40(%r8,%r9,4),%ymm0,%ymm2
+  DB  196,129,121,16,68,136,48            ; vmovupd       0x30(%r8,%r9,4),%xmm0
+  DB  196,195,125,13,192,12               ; vblendpd      $0xc,%ymm8,%ymm0,%ymm0
+  DB  196,1,121,16,68,136,32              ; vmovupd       0x20(%r8,%r9,4),%xmm8
+  DB  196,99,61,13,203,12                 ; vblendpd      $0xc,%ymm3,%ymm8,%ymm9
+  DB  196,129,121,16,92,136,16            ; vmovupd       0x10(%r8,%r9,4),%xmm3
+  DB  196,99,101,13,209,12                ; vblendpd      $0xc,%ymm1,%ymm3,%ymm10
+  DB  196,129,121,16,12,136               ; vmovupd       (%r8,%r9,4),%xmm1
+  DB  196,227,117,13,202,12               ; vblendpd      $0xc,%ymm2,%ymm1,%ymm1
+  DB  196,193,116,20,210                  ; vunpcklps     %ymm10,%ymm1,%ymm2
+  DB  196,193,116,21,218                  ; vunpckhps     %ymm10,%ymm1,%ymm3
+  DB  197,180,20,200                      ; vunpcklps     %ymm0,%ymm9,%ymm1
+  DB  197,52,21,192                       ; vunpckhps     %ymm0,%ymm9,%ymm8
+  DB  197,237,20,193                      ; vunpcklpd     %ymm1,%ymm2,%ymm0
+  DB  197,237,21,201                      ; vunpckhpd     %ymm1,%ymm2,%ymm1
+  DB  196,193,101,20,208                  ; vunpcklpd     %ymm8,%ymm3,%ymm2
+  DB  196,193,101,21,216                  ; vunpckhpd     %ymm8,%ymm3,%ymm3
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+  DB  102,144                             ; xchg          %ax,%ax
+  DB  131,255,255                         ; cmp           $0xffffffff,%edi
+  DB  255,202                             ; dec           %edx
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  189,255,255,255,176                 ; mov           $0xb0ffffff,%ebp
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  255,163,255,255,255,155             ; jmpq          *-0x64000001(%rbx)
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  255,147,255,255,255,139             ; callq         *-0x74000001(%rbx)
+  DB  255                                 ; (bad)
+  DB  255                                 ; (bad)
+  DB  255                                 ; .byte         0xff
+
 PUBLIC _sk_store_f32_avx
 _sk_store_f32_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -4818,7 +4917,7 @@
   DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
   DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           26b5 <_sk_store_f32_avx+0x6d>
+  DB  117,55                              ; jne           2775 <_sk_store_f32_avx+0x6d>
   DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -4831,22 +4930,22 @@
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            26b1 <_sk_store_f32_avx+0x69>
+  DB  116,240                             ; je            2771 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            26b1 <_sk_store_f32_avx+0x69>
+  DB  114,227                             ; jb            2771 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            26b1 <_sk_store_f32_avx+0x69>
+  DB  116,218                             ; je            2771 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            26b1 <_sk_store_f32_avx+0x69>
+  DB  114,205                             ; jb            2771 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            26b1 <_sk_store_f32_avx+0x69>
+  DB  116,195                             ; je            2771 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            26b1 <_sk_store_f32_avx+0x69>
+  DB  114,181                             ; jb            2771 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           26b1 <_sk_store_f32_avx+0x69>
+  DB  235,171                             ; jmp           2771 <_sk_store_f32_avx+0x69>
 
 PUBLIC _sk_clamp_x_avx
 _sk_clamp_x_avx LABEL PROC
@@ -7088,6 +7187,31 @@
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_load_f32_sse41
+_sk_load_f32_sse41 LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                            ; mov           (%rax),%rax
+  DB  72,137,249                          ; mov           %rdi,%rcx
+  DB  72,193,225,4                        ; shl           $0x4,%rcx
+  DB  68,15,16,4,8                        ; movups        (%rax,%rcx,1),%xmm8
+  DB  15,16,68,8,16                       ; movups        0x10(%rax,%rcx,1),%xmm0
+  DB  15,16,92,8,32                       ; movups        0x20(%rax,%rcx,1),%xmm3
+  DB  68,15,16,76,8,48                    ; movups        0x30(%rax,%rcx,1),%xmm9
+  DB  65,15,40,208                        ; movaps        %xmm8,%xmm2
+  DB  15,20,208                           ; unpcklps      %xmm0,%xmm2
+  DB  15,40,203                           ; movaps        %xmm3,%xmm1
+  DB  65,15,20,201                        ; unpcklps      %xmm9,%xmm1
+  DB  68,15,21,192                        ; unpckhps      %xmm0,%xmm8
+  DB  65,15,21,217                        ; unpckhps      %xmm9,%xmm3
+  DB  15,40,194                           ; movaps        %xmm2,%xmm0
+  DB  102,15,20,193                       ; unpcklpd      %xmm1,%xmm0
+  DB  15,18,202                           ; movhlps       %xmm2,%xmm1
+  DB  65,15,40,208                        ; movaps        %xmm8,%xmm2
+  DB  102,15,20,211                       ; unpcklpd      %xmm3,%xmm2
+  DB  65,15,18,216                        ; movhlps       %xmm8,%xmm3
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+
 PUBLIC _sk_store_f32_sse41
 _sk_store_f32_sse41 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -9477,6 +9601,31 @@
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_load_f32_sse2
+_sk_load_f32_sse2 LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                            ; mov           (%rax),%rax
+  DB  72,137,249                          ; mov           %rdi,%rcx
+  DB  72,193,225,4                        ; shl           $0x4,%rcx
+  DB  68,15,16,4,8                        ; movups        (%rax,%rcx,1),%xmm8
+  DB  15,16,68,8,16                       ; movups        0x10(%rax,%rcx,1),%xmm0
+  DB  15,16,92,8,32                       ; movups        0x20(%rax,%rcx,1),%xmm3
+  DB  68,15,16,76,8,48                    ; movups        0x30(%rax,%rcx,1),%xmm9
+  DB  65,15,40,208                        ; movaps        %xmm8,%xmm2
+  DB  15,20,208                           ; unpcklps      %xmm0,%xmm2
+  DB  15,40,203                           ; movaps        %xmm3,%xmm1
+  DB  65,15,20,201                        ; unpcklps      %xmm9,%xmm1
+  DB  68,15,21,192                        ; unpckhps      %xmm0,%xmm8
+  DB  65,15,21,217                        ; unpckhps      %xmm9,%xmm3
+  DB  15,40,194                           ; movaps        %xmm2,%xmm0
+  DB  102,15,20,193                       ; unpcklpd      %xmm1,%xmm0
+  DB  15,18,202                           ; movhlps       %xmm2,%xmm1
+  DB  65,15,40,208                        ; movaps        %xmm8,%xmm2
+  DB  102,15,20,211                       ; unpcklpd      %xmm3,%xmm2
+  DB  65,15,18,216                        ; movhlps       %xmm8,%xmm3
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+
 PUBLIC _sk_store_f32_sse2
 _sk_store_f32_sse2 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index a8d5a29..6e0c908 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -673,6 +673,10 @@
     store4((uint16_t*)ptr,tail, R,G,B,A);
 }
 
+STAGE(load_f32) {
+    auto ptr = *(const float**)ctx + 4*x;
+    load4(ptr,tail, &r,&g,&b,&a);
+}
 STAGE(store_f32) {
     auto ptr = *(float**)ctx + 4*x;
     store4(ptr,tail, r,g,b,a);
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index a829e72..01c0ccf 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -53,6 +53,13 @@
         ptr[2] = b;
         ptr[3] = a;
     }
+
+    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
+        *r = ptr[0];
+        *g = ptr[1];
+        *b = ptr[2];
+        *a = ptr[3];
+    }
     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         ptr[0] = r;
         ptr[1] = g;
@@ -106,6 +113,14 @@
     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
         vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
     }
+
+    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
+        float32x4x4_t rgba = vld4q_f32(ptr);
+        *r = rgba.val[0];
+        *g = rgba.val[1];
+        *b = rgba.val[2];
+        *a = rgba.val[3];
+    }
     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
     }
@@ -164,6 +179,14 @@
         vst4_lane_u16(ptr + 0, rgba, 0);
         vst4_lane_u16(ptr + 4, rgba, 1);
     }
+
+    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
+        float32x2x4_t rgba = vld4_f32(ptr);
+        *r = rgba.val[0];
+        *g = rgba.val[1];
+        *b = rgba.val[2];
+        *a = rgba.val[3];
+    }
     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
     }
@@ -285,6 +308,31 @@
             _mm_storeu_si128((__m128i*)ptr + 3, _67);
         }
     }
+
+    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
+        F _04, _15, _26, _37;
+
+        switch (tail) {
+            case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
+            case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
+            case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
+            case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
+            case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0);
+            case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0);
+            case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0);
+            case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
+        }
+
+        F rg0145 = _mm256_unpacklo_ps(_04,_15),  // r0 r1 g0 g1 | r4 r5 g4 g5
+          ba0145 = _mm256_unpackhi_ps(_04,_15),
+          rg2367 = _mm256_unpacklo_ps(_26,_37),
+          ba2367 = _mm256_unpackhi_ps(_26,_37);
+
+        *r = _mm256_unpacklo_pd(rg0145, rg2367);
+        *g = _mm256_unpackhi_pd(rg0145, rg2367);
+        *b = _mm256_unpacklo_pd(ba0145, ba2367);
+        *a = _mm256_unpackhi_pd(ba0145, ba2367);
+    }
     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
           rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
@@ -408,6 +456,18 @@
         _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
         _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
     }
+
+    SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
+        auto _0 = _mm_loadu_ps(ptr+ 0),
+             _1 = _mm_loadu_ps(ptr+ 4),
+             _2 = _mm_loadu_ps(ptr+ 8),
+             _3 = _mm_loadu_ps(ptr+12);
+        _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
+        *r = _0;
+        *g = _1;
+        *b = _2;
+        *a = _3;
+    }
     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
         _MM_TRANSPOSE4_PS(r,g,b,a);
         _mm_storeu_ps(ptr+ 0, r);