SkJumper: store_f32

Change-Id: I4bc6d1a8787c540fd1a29274650d34392e56651c
Reviewed-on: https://skia-review.googlesource.com/9223
Reviewed-by: Mike Klein <mtklein@chromium.org>
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index cf36433..71c8644 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -79,6 +79,7 @@
     M(store_8888)         \
     M(load_f16)           \
     M(store_f16)          \
+    M(store_f32)          \
     M(matrix_2x3)         \
     M(matrix_3x4)         \
     M(matrix_perspective) \
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 776710a..34aeab4 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -679,6 +679,16 @@
   .long  0x91004021                          // add           x1, x1, #0x10
   .long  0xd61f0060                          // br            x3
 
+.globl _sk_store_f32_aarch64
+_sk_store_f32_aarch64:
+  .long  0xf9400028                          // ldr           x8, [x1]
+  .long  0xf9400108                          // ldr           x8, [x8]
+  .long  0x8b001108                          // add           x8, x8, x0, lsl #4
+  .long  0x4c000900                          // st4           {v0.4s-v3.4s}, [x8]
+  .long  0xf9400423                          // ldr           x3, [x1,#8]
+  .long  0x91004021                          // add           x1, x1, #0x10
+  .long  0xd61f0060                          // br            x3
+
 .globl _sk_clamp_x_aarch64
 _sk_clamp_x_aarch64:
   .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
@@ -1564,6 +1574,17 @@
   .long  0xe1a01003                          // mov           r1, r3
   .long  0xe12fff1c                          // bx            ip
 
+.globl _sk_store_f32_vfp4
+_sk_store_f32_vfp4:
+  .long  0xe5913000                          // ldr           r3, [r1]
+  .long  0xe5933000                          // ldr           r3, [r3]
+  .long  0xe0833200                          // add           r3, r3, r0, lsl #4
+  .long  0xf403008f                          // vst4.32       {d0-d3}, [r3]
+  .long  0xe2813008                          // add           r3, r1, #8
+  .long  0xe591c004                          // ldr           ip, [r1, #4]
+  .long  0xe1a01003                          // mov           r1, r3
+  .long  0xe12fff1c                          // bx            ip
+
 .globl _sk_clamp_x_vfp4
 _sk_clamp_x_vfp4:
   .long  0xe8911008                          // ldm           r1, {r3, ip}
@@ -2744,6 +2765,50 @@
   .byte  197,121,214,68,248,48               // vmovq         %xmm8,0x30(%rax,%rdi,8)
   .byte  235,181                             // jmp           be4 <_sk_store_f16_hsw+0x61>
 
+.globl _sk_store_f32_hsw
+_sk_store_f32_hsw:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  76,139,0                            // mov           (%rax),%r8
+  .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
+  .byte  197,124,20,193                      // vunpcklps     %ymm1,%ymm0,%ymm8
+  .byte  197,124,21,217                      // vunpckhps     %ymm1,%ymm0,%ymm11
+  .byte  197,108,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm9
+  .byte  197,108,21,227                      // vunpckhps     %ymm3,%ymm2,%ymm12
+  .byte  196,65,61,20,209                    // vunpcklpd     %ymm9,%ymm8,%ymm10
+  .byte  196,65,61,21,201                    // vunpckhpd     %ymm9,%ymm8,%ymm9
+  .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
+  .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
+  .byte  72,133,201                          // test          %rcx,%rcx
+  .byte  117,55                              // jne           c9c <_sk_store_f32_hsw+0x6d>
+  .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  .byte  196,67,61,6,195,49                  // vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  .byte  196,65,125,17,36,128                // vmovupd       %ymm12,(%r8,%rax,4)
+  .byte  196,65,125,17,108,128,32            // vmovupd       %ymm13,0x20(%r8,%rax,4)
+  .byte  196,65,125,17,76,128,64             // vmovupd       %ymm9,0x40(%r8,%rax,4)
+  .byte  196,65,125,17,68,128,96             // vmovupd       %ymm8,0x60(%r8,%rax,4)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+  .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
+  .byte  72,131,249,1                        // cmp           $0x1,%rcx
+  .byte  116,240                             // je            c98 <_sk_store_f32_hsw+0x69>
+  .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
+  .byte  72,131,249,3                        // cmp           $0x3,%rcx
+  .byte  114,227                             // jb            c98 <_sk_store_f32_hsw+0x69>
+  .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
+  .byte  116,218                             // je            c98 <_sk_store_f32_hsw+0x69>
+  .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
+  .byte  72,131,249,5                        // cmp           $0x5,%rcx
+  .byte  114,205                             // jb            c98 <_sk_store_f32_hsw+0x69>
+  .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  .byte  116,195                             // je            c98 <_sk_store_f32_hsw+0x69>
+  .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  .byte  72,131,249,7                        // cmp           $0x7,%rcx
+  .byte  114,181                             // jb            c98 <_sk_store_f32_hsw+0x69>
+  .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  .byte  235,171                             // jmp           c98 <_sk_store_f32_hsw+0x69>
+
 .globl _sk_clamp_x_hsw
 _sk_clamp_x_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
@@ -4158,6 +4223,50 @@
   .byte  197,121,214,68,248,48               // vmovq         %xmm8,0x30(%rax,%rdi,8)
   .byte  235,181                             // jmp           115f <_sk_store_f16_avx+0xbf>
 
+.globl _sk_store_f32_avx
+_sk_store_f32_avx:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  76,139,0                            // mov           (%rax),%r8
+  .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
+  .byte  197,124,20,193                      // vunpcklps     %ymm1,%ymm0,%ymm8
+  .byte  197,124,21,217                      // vunpckhps     %ymm1,%ymm0,%ymm11
+  .byte  197,108,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm9
+  .byte  197,108,21,227                      // vunpckhps     %ymm3,%ymm2,%ymm12
+  .byte  196,65,61,20,209                    // vunpcklpd     %ymm9,%ymm8,%ymm10
+  .byte  196,65,61,21,201                    // vunpckhpd     %ymm9,%ymm8,%ymm9
+  .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
+  .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
+  .byte  72,133,201                          // test          %rcx,%rcx
+  .byte  117,55                              // jne           1217 <_sk_store_f32_avx+0x6d>
+  .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  .byte  196,67,61,6,195,49                  // vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  .byte  196,65,125,17,36,128                // vmovupd       %ymm12,(%r8,%rax,4)
+  .byte  196,65,125,17,108,128,32            // vmovupd       %ymm13,0x20(%r8,%rax,4)
+  .byte  196,65,125,17,76,128,64             // vmovupd       %ymm9,0x40(%r8,%rax,4)
+  .byte  196,65,125,17,68,128,96             // vmovupd       %ymm8,0x60(%r8,%rax,4)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+  .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
+  .byte  72,131,249,1                        // cmp           $0x1,%rcx
+  .byte  116,240                             // je            1213 <_sk_store_f32_avx+0x69>
+  .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
+  .byte  72,131,249,3                        // cmp           $0x3,%rcx
+  .byte  114,227                             // jb            1213 <_sk_store_f32_avx+0x69>
+  .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
+  .byte  116,218                             // je            1213 <_sk_store_f32_avx+0x69>
+  .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
+  .byte  72,131,249,5                        // cmp           $0x5,%rcx
+  .byte  114,205                             // jb            1213 <_sk_store_f32_avx+0x69>
+  .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  .byte  116,195                             // je            1213 <_sk_store_f32_avx+0x69>
+  .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  .byte  72,131,249,7                        // cmp           $0x7,%rcx
+  .byte  114,181                             // jb            1213 <_sk_store_f32_avx+0x69>
+  .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  .byte  235,171                             // jmp           1213 <_sk_store_f32_avx+0x69>
+
 .globl _sk_clamp_x_avx
 _sk_clamp_x_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
@@ -5137,6 +5246,33 @@
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
+.globl _sk_store_f32_sse41
+_sk_store_f32_sse41:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,139,0                            // mov           (%rax),%rax
+  .byte  72,137,249                          // mov           %rdi,%rcx
+  .byte  72,193,225,4                        // shl           $0x4,%rcx
+  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
+  .byte  68,15,40,200                        // movaps        %xmm0,%xmm9
+  .byte  68,15,20,201                        // unpcklps      %xmm1,%xmm9
+  .byte  68,15,40,210                        // movaps        %xmm2,%xmm10
+  .byte  68,15,40,218                        // movaps        %xmm2,%xmm11
+  .byte  68,15,20,219                        // unpcklps      %xmm3,%xmm11
+  .byte  68,15,21,193                        // unpckhps      %xmm1,%xmm8
+  .byte  68,15,21,211                        // unpckhps      %xmm3,%xmm10
+  .byte  69,15,40,225                        // movaps        %xmm9,%xmm12
+  .byte  102,69,15,20,227                    // unpcklpd      %xmm11,%xmm12
+  .byte  102,69,15,21,203                    // unpckhpd      %xmm11,%xmm9
+  .byte  69,15,40,216                        // movaps        %xmm8,%xmm11
+  .byte  102,69,15,20,218                    // unpcklpd      %xmm10,%xmm11
+  .byte  102,69,15,21,194                    // unpckhpd      %xmm10,%xmm8
+  .byte  102,68,15,17,36,8                   // movupd        %xmm12,(%rax,%rcx,1)
+  .byte  102,68,15,17,76,8,16                // movupd        %xmm9,0x10(%rax,%rcx,1)
+  .byte  102,68,15,17,92,8,32                // movupd        %xmm11,0x20(%rax,%rcx,1)
+  .byte  102,68,15,17,68,8,48                // movupd        %xmm8,0x30(%rax,%rcx,1)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+
 .globl _sk_clamp_x_sse41
 _sk_clamp_x_sse41:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
@@ -6170,6 +6306,33 @@
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
+.globl _sk_store_f32_sse2
+_sk_store_f32_sse2:
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,139,0                            // mov           (%rax),%rax
+  .byte  72,137,249                          // mov           %rdi,%rcx
+  .byte  72,193,225,4                        // shl           $0x4,%rcx
+  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
+  .byte  68,15,40,200                        // movaps        %xmm0,%xmm9
+  .byte  68,15,20,201                        // unpcklps      %xmm1,%xmm9
+  .byte  68,15,40,210                        // movaps        %xmm2,%xmm10
+  .byte  68,15,40,218                        // movaps        %xmm2,%xmm11
+  .byte  68,15,20,219                        // unpcklps      %xmm3,%xmm11
+  .byte  68,15,21,193                        // unpckhps      %xmm1,%xmm8
+  .byte  68,15,21,211                        // unpckhps      %xmm3,%xmm10
+  .byte  69,15,40,225                        // movaps        %xmm9,%xmm12
+  .byte  102,69,15,20,227                    // unpcklpd      %xmm11,%xmm12
+  .byte  102,69,15,21,203                    // unpckhpd      %xmm11,%xmm9
+  .byte  69,15,40,216                        // movaps        %xmm8,%xmm11
+  .byte  102,69,15,20,218                    // unpcklpd      %xmm10,%xmm11
+  .byte  102,69,15,21,194                    // unpckhpd      %xmm10,%xmm8
+  .byte  102,68,15,17,36,8                   // movupd        %xmm12,(%rax,%rcx,1)
+  .byte  102,68,15,17,76,8,16                // movupd        %xmm9,0x10(%rax,%rcx,1)
+  .byte  102,68,15,17,92,8,32                // movupd        %xmm11,0x20(%rax,%rcx,1)
+  .byte  102,68,15,17,68,8,48                // movupd        %xmm8,0x30(%rax,%rcx,1)
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  255,224                             // jmpq          *%rax
+
 .globl _sk_clamp_x_sse2
 _sk_clamp_x_sse2:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index d84a253..6afcfca 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -958,6 +958,50 @@
   DB  197,121,214,68,248,48               ; vmovq         %xmm8,0x30(%rax,%rdi,8)
   DB  235,181                             ; jmp           c7c <_sk_store_f16_hsw+0x61>
 
+PUBLIC _sk_store_f32_hsw
+_sk_store_f32_hsw LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  76,139,0                            ; mov           (%rax),%r8
+  DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
+  DB  197,124,20,193                      ; vunpcklps     %ymm1,%ymm0,%ymm8
+  DB  197,124,21,217                      ; vunpckhps     %ymm1,%ymm0,%ymm11
+  DB  197,108,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm9
+  DB  197,108,21,227                      ; vunpckhps     %ymm3,%ymm2,%ymm12
+  DB  196,65,61,20,209                    ; vunpcklpd     %ymm9,%ymm8,%ymm10
+  DB  196,65,61,21,201                    ; vunpckhpd     %ymm9,%ymm8,%ymm9
+  DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
+  DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
+  DB  72,133,201                          ; test          %rcx,%rcx
+  DB  117,55                              ; jne           d34 <_sk_store_f32_hsw+0x6d>
+  DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  DB  196,67,61,6,195,49                  ; vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  DB  196,65,125,17,36,128                ; vmovupd       %ymm12,(%r8,%rax,4)
+  DB  196,65,125,17,108,128,32            ; vmovupd       %ymm13,0x20(%r8,%rax,4)
+  DB  196,65,125,17,76,128,64             ; vmovupd       %ymm9,0x40(%r8,%rax,4)
+  DB  196,65,125,17,68,128,96             ; vmovupd       %ymm8,0x60(%r8,%rax,4)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+  DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
+  DB  72,131,249,1                        ; cmp           $0x1,%rcx
+  DB  116,240                             ; je            d30 <_sk_store_f32_hsw+0x69>
+  DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
+  DB  72,131,249,3                        ; cmp           $0x3,%rcx
+  DB  114,227                             ; jb            d30 <_sk_store_f32_hsw+0x69>
+  DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
+  DB  116,218                             ; je            d30 <_sk_store_f32_hsw+0x69>
+  DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
+  DB  72,131,249,5                        ; cmp           $0x5,%rcx
+  DB  114,205                             ; jb            d30 <_sk_store_f32_hsw+0x69>
+  DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  DB  116,195                             ; je            d30 <_sk_store_f32_hsw+0x69>
+  DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  DB  72,131,249,7                        ; cmp           $0x7,%rcx
+  DB  114,181                             ; jb            d30 <_sk_store_f32_hsw+0x69>
+  DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  DB  235,171                             ; jmp           d30 <_sk_store_f32_hsw+0x69>
+
 PUBLIC _sk_clamp_x_hsw
 _sk_clamp_x_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -2399,6 +2443,50 @@
   DB  197,121,214,68,248,48               ; vmovq         %xmm8,0x30(%rax,%rdi,8)
   DB  235,181                             ; jmp           11f7 <_sk_store_f16_avx+0xbf>
 
+PUBLIC _sk_store_f32_avx
+_sk_store_f32_avx LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  76,139,0                            ; mov           (%rax),%r8
+  DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
+  DB  197,124,20,193                      ; vunpcklps     %ymm1,%ymm0,%ymm8
+  DB  197,124,21,217                      ; vunpckhps     %ymm1,%ymm0,%ymm11
+  DB  197,108,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm9
+  DB  197,108,21,227                      ; vunpckhps     %ymm3,%ymm2,%ymm12
+  DB  196,65,61,20,209                    ; vunpcklpd     %ymm9,%ymm8,%ymm10
+  DB  196,65,61,21,201                    ; vunpckhpd     %ymm9,%ymm8,%ymm9
+  DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
+  DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
+  DB  72,133,201                          ; test          %rcx,%rcx
+  DB  117,55                              ; jne           12af <_sk_store_f32_avx+0x6d>
+  DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  DB  196,67,61,6,195,49                  ; vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  DB  196,65,125,17,36,128                ; vmovupd       %ymm12,(%r8,%rax,4)
+  DB  196,65,125,17,108,128,32            ; vmovupd       %ymm13,0x20(%r8,%rax,4)
+  DB  196,65,125,17,76,128,64             ; vmovupd       %ymm9,0x40(%r8,%rax,4)
+  DB  196,65,125,17,68,128,96             ; vmovupd       %ymm8,0x60(%r8,%rax,4)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+  DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
+  DB  72,131,249,1                        ; cmp           $0x1,%rcx
+  DB  116,240                             ; je            12ab <_sk_store_f32_avx+0x69>
+  DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
+  DB  72,131,249,3                        ; cmp           $0x3,%rcx
+  DB  114,227                             ; jb            12ab <_sk_store_f32_avx+0x69>
+  DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
+  DB  116,218                             ; je            12ab <_sk_store_f32_avx+0x69>
+  DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
+  DB  72,131,249,5                        ; cmp           $0x5,%rcx
+  DB  114,205                             ; jb            12ab <_sk_store_f32_avx+0x69>
+  DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  DB  116,195                             ; je            12ab <_sk_store_f32_avx+0x69>
+  DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  DB  72,131,249,7                        ; cmp           $0x7,%rcx
+  DB  114,181                             ; jb            12ab <_sk_store_f32_avx+0x69>
+  DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  DB  235,171                             ; jmp           12ab <_sk_store_f32_avx+0x69>
+
 PUBLIC _sk_clamp_x_avx
 _sk_clamp_x_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -3405,6 +3493,33 @@
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_store_f32_sse41
+_sk_store_f32_sse41 LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                            ; mov           (%rax),%rax
+  DB  72,137,249                          ; mov           %rdi,%rcx
+  DB  72,193,225,4                        ; shl           $0x4,%rcx
+  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
+  DB  68,15,40,200                        ; movaps        %xmm0,%xmm9
+  DB  68,15,20,201                        ; unpcklps      %xmm1,%xmm9
+  DB  68,15,40,210                        ; movaps        %xmm2,%xmm10
+  DB  68,15,40,218                        ; movaps        %xmm2,%xmm11
+  DB  68,15,20,219                        ; unpcklps      %xmm3,%xmm11
+  DB  68,15,21,193                        ; unpckhps      %xmm1,%xmm8
+  DB  68,15,21,211                        ; unpckhps      %xmm3,%xmm10
+  DB  69,15,40,225                        ; movaps        %xmm9,%xmm12
+  DB  102,69,15,20,227                    ; unpcklpd      %xmm11,%xmm12
+  DB  102,69,15,21,203                    ; unpckhpd      %xmm11,%xmm9
+  DB  69,15,40,216                        ; movaps        %xmm8,%xmm11
+  DB  102,69,15,20,218                    ; unpcklpd      %xmm10,%xmm11
+  DB  102,69,15,21,194                    ; unpckhpd      %xmm10,%xmm8
+  DB  102,68,15,17,36,8                   ; movupd        %xmm12,(%rax,%rcx,1)
+  DB  102,68,15,17,76,8,16                ; movupd        %xmm9,0x10(%rax,%rcx,1)
+  DB  102,68,15,17,92,8,32                ; movupd        %xmm11,0x20(%rax,%rcx,1)
+  DB  102,68,15,17,68,8,48                ; movupd        %xmm8,0x30(%rax,%rcx,1)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+
 PUBLIC _sk_clamp_x_sse41
 _sk_clamp_x_sse41 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
@@ -4465,6 +4580,33 @@
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
+PUBLIC _sk_store_f32_sse2
+_sk_store_f32_sse2 LABEL PROC
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                            ; mov           (%rax),%rax
+  DB  72,137,249                          ; mov           %rdi,%rcx
+  DB  72,193,225,4                        ; shl           $0x4,%rcx
+  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
+  DB  68,15,40,200                        ; movaps        %xmm0,%xmm9
+  DB  68,15,20,201                        ; unpcklps      %xmm1,%xmm9
+  DB  68,15,40,210                        ; movaps        %xmm2,%xmm10
+  DB  68,15,40,218                        ; movaps        %xmm2,%xmm11
+  DB  68,15,20,219                        ; unpcklps      %xmm3,%xmm11
+  DB  68,15,21,193                        ; unpckhps      %xmm1,%xmm8
+  DB  68,15,21,211                        ; unpckhps      %xmm3,%xmm10
+  DB  69,15,40,225                        ; movaps        %xmm9,%xmm12
+  DB  102,69,15,20,227                    ; unpcklpd      %xmm11,%xmm12
+  DB  102,69,15,21,203                    ; unpckhpd      %xmm11,%xmm9
+  DB  69,15,40,216                        ; movaps        %xmm8,%xmm11
+  DB  102,69,15,20,218                    ; unpcklpd      %xmm10,%xmm11
+  DB  102,69,15,21,194                    ; unpckhpd      %xmm10,%xmm8
+  DB  102,68,15,17,36,8                   ; movupd        %xmm12,(%rax,%rcx,1)
+  DB  102,68,15,17,76,8,16                ; movupd        %xmm9,0x10(%rax,%rcx,1)
+  DB  102,68,15,17,92,8,32                ; movupd        %xmm11,0x20(%rax,%rcx,1)
+  DB  102,68,15,17,68,8,48                ; movupd        %xmm8,0x30(%rax,%rcx,1)
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  255,224                             ; jmpq          *%rax
+
 PUBLIC _sk_clamp_x_sse2
 _sk_clamp_x_sse2 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 2f9c60a..0147408 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -1000,6 +1000,57 @@
 #endif
 }
 
+STAGE(store_f32) {
+    auto ptr = *(float**)ctx + 4*x;
+
+#if !defined(JUMPER)
+    ptr[0] = r;
+    ptr[1] = g;
+    ptr[2] = b;
+    ptr[3] = a;
+#elif defined(__aarch64__)
+    vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
+#elif defined(__arm__)
+    vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
+#elif defined(__AVX__)
+    F rg0145 = _mm256_unpacklo_ps(r, g),  // r0 g0 r1 g1 | r4 g4 r5 g5
+      rg2367 = _mm256_unpackhi_ps(r, g),  // r2 ...      | r6 ...
+      ba0145 = _mm256_unpacklo_ps(b, a),  // b0 a0 b1 a1 | b4 a4 b5 a5
+      ba2367 = _mm256_unpackhi_ps(b, a);  // b2 ...      | b6 ...
+
+    F _04 = _mm256_unpacklo_pd(rg0145, ba0145),  // r0 g0 b0 a0 | r4 g4 b4 a4
+      _15 = _mm256_unpackhi_pd(rg0145, ba0145),  // r1 ...      | r5 ...
+      _26 = _mm256_unpacklo_pd(rg2367, ba2367),  // r2 ...      | r6 ...
+      _37 = _mm256_unpackhi_pd(rg2367, ba2367);  // r3 ...      | r7 ...
+
+    if (__builtin_expect(tail, 0)) {
+        if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
+        if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
+        if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
+        if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
+        if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
+        if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
+        if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
+    } else {
+        F _01 = _mm256_permute2f128_ps(_04, _15, 32),  // 32 == 0010 0000 == lo, lo
+          _23 = _mm256_permute2f128_ps(_26, _37, 32),
+          _45 = _mm256_permute2f128_ps(_04, _15, 49),  // 49 == 0011 0001 == hi, hi
+          _67 = _mm256_permute2f128_ps(_26, _37, 49);
+        _mm256_storeu_ps(ptr+ 0, _01);
+        _mm256_storeu_ps(ptr+ 8, _23);
+        _mm256_storeu_ps(ptr+16, _45);
+        _mm256_storeu_ps(ptr+24, _67);
+    }
+#elif defined(__SSE2__)
+    auto v0 = r, v1 = g, v2 = b, v3 = a;
+    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);
+    memcpy(ptr+ 0, &v0, sizeof(v0));
+    memcpy(ptr+ 4, &v1, sizeof(v1));
+    memcpy(ptr+ 8, &v2, sizeof(v2));
+    memcpy(ptr+12, &v3, sizeof(v3));
+#endif
+}
+
 static F ulp_before(F v) {
     return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff));
 }