SkJumper: store_f32
Change-Id: I4bc6d1a8787c540fd1a29274650d34392e56651c
Reviewed-on: https://skia-review.googlesource.com/9223
Reviewed-by: Mike Klein <mtklein@chromium.org>
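
This change adds a store_f32 stage, which writes the current r,g,b,a registers out as interleaved 32-bit-float RGBA (16 bytes per pixel). The per-pixel semantics match the portable path added to SkJumper_stages.cpp below; as a minimal standalone sketch (the function name here is illustrative, not part of the change):

    // Sketch: one pixel of interleaved RGBA_F32 is 4 floats, 16 bytes.
    static void store_f32_pixel(float* dst, size_t x,
                                float r, float g, float b, float a) {
        float* ptr = dst + 4*x;
        ptr[0] = r;
        ptr[1] = g;
        ptr[2] = b;
        ptr[3] = a;
    }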
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
index cf36433..71c8644 100644
--- a/src/jumper/SkJumper.cpp
+++ b/src/jumper/SkJumper.cpp
@@ -79,6 +79,7 @@
M(store_8888) \
M(load_f16) \
M(store_f16) \
+ M(store_f32) \
M(matrix_2x3) \
M(matrix_3x4) \
M(matrix_perspective) \
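
The M(...) list in SkJumper.cpp is an X-macro: adding M(store_f32) once is what wires the new stage into each per-architecture symbol below (_sk_store_f32_hsw, _sk_store_f32_sse2, and so on). As a hedged illustration of the pattern only, not SkJumper's actual expansion:

    // Illustrative X-macro: one list, many expansions.
    #define STAGES(M) \
        M(store_f16)  \
        M(store_f32)

    // One possible expansion: declare an extern symbol per stage.
    #define M(st) extern "C" void _sk_##st##_hsw(void);
    STAGES(M)
    #undef M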
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
index 776710a..34aeab4 100644
--- a/src/jumper/SkJumper_generated.S
+++ b/src/jumper/SkJumper_generated.S
@@ -679,6 +679,16 @@
.long 0x91004021 // add x1, x1, #0x10
.long 0xd61f0060 // br x3
+.globl _sk_store_f32_aarch64
+_sk_store_f32_aarch64:
+ .long 0xf9400028 // ldr x8, [x1]
+ .long 0xf9400108 // ldr x8, [x8]
+ .long 0x8b001108 // add x8, x8, x0, lsl #4
+ .long 0x4c000900 // st4 {v0.4s-v3.4s}, [x8]
+ .long 0xf9400423 // ldr x3, [x1,#8]
+ .long 0x91004021 // add x1, x1, #0x10
+ .long 0xd61f0060 // br x3
+
.globl _sk_clamp_x_aarch64
_sk_clamp_x_aarch64:
.long 0xa8c10c28 // ldp x8, x3, [x1],#16
@@ -1564,6 +1574,17 @@
.long 0xe1a01003 // mov r1, r3
.long 0xe12fff1c // bx ip
+.globl _sk_store_f32_vfp4
+_sk_store_f32_vfp4:
+ .long 0xe5913000 // ldr r3, [r1]
+ .long 0xe5933000 // ldr r3, [r3]
+ .long 0xe0833200 // add r3, r3, r0, lsl #4
+ .long 0xf403008f // vst4.32 {d0-d3}, [r3]
+ .long 0xe2813008 // add r3, r1, #8
+ .long 0xe591c004 // ldr ip, [r1, #4]
+ .long 0xe1a01003 // mov r1, r3
+ .long 0xe12fff1c // bx ip
+
.globl _sk_clamp_x_vfp4
_sk_clamp_x_vfp4:
.long 0xe8911008 // ldm r1, {r3, ip}
@@ -2744,6 +2765,50 @@
.byte 197,121,214,68,248,48 // vmovq %xmm8,0x30(%rax,%rdi,8)
.byte 235,181 // jmp be4 <_sk_store_f16_hsw+0x61>
+.globl _sk_store_f32_hsw
+_sk_store_f32_hsw:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,0 // mov (%rax),%r8
+ .byte 72,141,4,189,0,0,0,0 // lea 0x0(,%rdi,4),%rax
+ .byte 197,124,20,193 // vunpcklps %ymm1,%ymm0,%ymm8
+ .byte 197,124,21,217 // vunpckhps %ymm1,%ymm0,%ymm11
+ .byte 197,108,20,203 // vunpcklps %ymm3,%ymm2,%ymm9
+ .byte 197,108,21,227 // vunpckhps %ymm3,%ymm2,%ymm12
+ .byte 196,65,61,20,209 // vunpcklpd %ymm9,%ymm8,%ymm10
+ .byte 196,65,61,21,201 // vunpckhpd %ymm9,%ymm8,%ymm9
+ .byte 196,65,37,20,196 // vunpcklpd %ymm12,%ymm11,%ymm8
+ .byte 196,65,37,21,220 // vunpckhpd %ymm12,%ymm11,%ymm11
+ .byte 72,133,201 // test %rcx,%rcx
+ .byte 117,55 // jne c9c <_sk_store_f32_hsw+0x6d>
+ .byte 196,67,45,24,225,1 // vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
+ .byte 196,67,61,24,235,1 // vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
+ .byte 196,67,45,6,201,49 // vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
+ .byte 196,67,61,6,195,49 // vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
+ .byte 196,65,125,17,36,128 // vmovupd %ymm12,(%r8,%rax,4)
+ .byte 196,65,125,17,108,128,32 // vmovupd %ymm13,0x20(%r8,%rax,4)
+ .byte 196,65,125,17,76,128,64 // vmovupd %ymm9,0x40(%r8,%rax,4)
+ .byte 196,65,125,17,68,128,96 // vmovupd %ymm8,0x60(%r8,%rax,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,65,121,17,20,128 // vmovupd %xmm10,(%r8,%rax,4)
+ .byte 72,131,249,1 // cmp $0x1,%rcx
+ .byte 116,240 // je c98 <_sk_store_f32_hsw+0x69>
+ .byte 196,65,121,17,76,128,16 // vmovupd %xmm9,0x10(%r8,%rax,4)
+ .byte 72,131,249,3 // cmp $0x3,%rcx
+ .byte 114,227 // jb c98 <_sk_store_f32_hsw+0x69>
+ .byte 196,65,121,17,68,128,32 // vmovupd %xmm8,0x20(%r8,%rax,4)
+ .byte 116,218 // je c98 <_sk_store_f32_hsw+0x69>
+ .byte 196,65,121,17,92,128,48 // vmovupd %xmm11,0x30(%r8,%rax,4)
+ .byte 72,131,249,5 // cmp $0x5,%rcx
+ .byte 114,205 // jb c98 <_sk_store_f32_hsw+0x69>
+ .byte 196,67,125,25,84,128,64,1 // vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
+ .byte 116,195 // je c98 <_sk_store_f32_hsw+0x69>
+ .byte 196,67,125,25,76,128,80,1 // vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
+ .byte 72,131,249,7 // cmp $0x7,%rcx
+ .byte 114,181 // jb c98 <_sk_store_f32_hsw+0x69>
+ .byte 196,67,125,25,68,128,96,1 // vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
+ .byte 235,171 // jmp c98 <_sk_store_f32_hsw+0x69>
+
.globl _sk_clamp_x_hsw
_sk_clamp_x_hsw:
.byte 72,173 // lods %ds:(%rsi),%rax
@@ -4158,6 +4223,50 @@
.byte 197,121,214,68,248,48 // vmovq %xmm8,0x30(%rax,%rdi,8)
.byte 235,181 // jmp 115f <_sk_store_f16_avx+0xbf>
+.globl _sk_store_f32_avx
+_sk_store_f32_avx:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 76,139,0 // mov (%rax),%r8
+ .byte 72,141,4,189,0,0,0,0 // lea 0x0(,%rdi,4),%rax
+ .byte 197,124,20,193 // vunpcklps %ymm1,%ymm0,%ymm8
+ .byte 197,124,21,217 // vunpckhps %ymm1,%ymm0,%ymm11
+ .byte 197,108,20,203 // vunpcklps %ymm3,%ymm2,%ymm9
+ .byte 197,108,21,227 // vunpckhps %ymm3,%ymm2,%ymm12
+ .byte 196,65,61,20,209 // vunpcklpd %ymm9,%ymm8,%ymm10
+ .byte 196,65,61,21,201 // vunpckhpd %ymm9,%ymm8,%ymm9
+ .byte 196,65,37,20,196 // vunpcklpd %ymm12,%ymm11,%ymm8
+ .byte 196,65,37,21,220 // vunpckhpd %ymm12,%ymm11,%ymm11
+ .byte 72,133,201 // test %rcx,%rcx
+ .byte 117,55 // jne 1217 <_sk_store_f32_avx+0x6d>
+ .byte 196,67,45,24,225,1 // vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
+ .byte 196,67,61,24,235,1 // vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
+ .byte 196,67,45,6,201,49 // vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
+ .byte 196,67,61,6,195,49 // vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
+ .byte 196,65,125,17,36,128 // vmovupd %ymm12,(%r8,%rax,4)
+ .byte 196,65,125,17,108,128,32 // vmovupd %ymm13,0x20(%r8,%rax,4)
+ .byte 196,65,125,17,76,128,64 // vmovupd %ymm9,0x40(%r8,%rax,4)
+ .byte 196,65,125,17,68,128,96 // vmovupd %ymm8,0x60(%r8,%rax,4)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+ .byte 196,65,121,17,20,128 // vmovupd %xmm10,(%r8,%rax,4)
+ .byte 72,131,249,1 // cmp $0x1,%rcx
+ .byte 116,240 // je 1213 <_sk_store_f32_avx+0x69>
+ .byte 196,65,121,17,76,128,16 // vmovupd %xmm9,0x10(%r8,%rax,4)
+ .byte 72,131,249,3 // cmp $0x3,%rcx
+ .byte 114,227 // jb 1213 <_sk_store_f32_avx+0x69>
+ .byte 196,65,121,17,68,128,32 // vmovupd %xmm8,0x20(%r8,%rax,4)
+ .byte 116,218 // je 1213 <_sk_store_f32_avx+0x69>
+ .byte 196,65,121,17,92,128,48 // vmovupd %xmm11,0x30(%r8,%rax,4)
+ .byte 72,131,249,5 // cmp $0x5,%rcx
+ .byte 114,205 // jb 1213 <_sk_store_f32_avx+0x69>
+ .byte 196,67,125,25,84,128,64,1 // vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
+ .byte 116,195 // je 1213 <_sk_store_f32_avx+0x69>
+ .byte 196,67,125,25,76,128,80,1 // vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
+ .byte 72,131,249,7 // cmp $0x7,%rcx
+ .byte 114,181 // jb 1213 <_sk_store_f32_avx+0x69>
+ .byte 196,67,125,25,68,128,96,1 // vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
+ .byte 235,171 // jmp 1213 <_sk_store_f32_avx+0x69>
+
.globl _sk_clamp_x_avx
_sk_clamp_x_avx:
.byte 72,173 // lods %ds:(%rsi),%rax
@@ -5137,6 +5246,33 @@
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
+.globl _sk_store_f32_sse41
+_sk_store_f32_sse41:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 72,139,0 // mov (%rax),%rax
+ .byte 72,137,249 // mov %rdi,%rcx
+ .byte 72,193,225,4 // shl $0x4,%rcx
+ .byte 68,15,40,192 // movaps %xmm0,%xmm8
+ .byte 68,15,40,200 // movaps %xmm0,%xmm9
+ .byte 68,15,20,201 // unpcklps %xmm1,%xmm9
+ .byte 68,15,40,210 // movaps %xmm2,%xmm10
+ .byte 68,15,40,218 // movaps %xmm2,%xmm11
+ .byte 68,15,20,219 // unpcklps %xmm3,%xmm11
+ .byte 68,15,21,193 // unpckhps %xmm1,%xmm8
+ .byte 68,15,21,211 // unpckhps %xmm3,%xmm10
+ .byte 69,15,40,225 // movaps %xmm9,%xmm12
+ .byte 102,69,15,20,227 // unpcklpd %xmm11,%xmm12
+ .byte 102,69,15,21,203 // unpckhpd %xmm11,%xmm9
+ .byte 69,15,40,216 // movaps %xmm8,%xmm11
+ .byte 102,69,15,20,218 // unpcklpd %xmm10,%xmm11
+ .byte 102,69,15,21,194 // unpckhpd %xmm10,%xmm8
+ .byte 102,68,15,17,36,8 // movupd %xmm12,(%rax,%rcx,1)
+ .byte 102,68,15,17,76,8,16 // movupd %xmm9,0x10(%rax,%rcx,1)
+ .byte 102,68,15,17,92,8,32 // movupd %xmm11,0x20(%rax,%rcx,1)
+ .byte 102,68,15,17,68,8,48 // movupd %xmm8,0x30(%rax,%rcx,1)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
.globl _sk_clamp_x_sse41
_sk_clamp_x_sse41:
.byte 72,173 // lods %ds:(%rsi),%rax
@@ -6170,6 +6306,33 @@
.byte 72,173 // lods %ds:(%rsi),%rax
.byte 255,224 // jmpq *%rax
+.globl _sk_store_f32_sse2
+_sk_store_f32_sse2:
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 72,139,0 // mov (%rax),%rax
+ .byte 72,137,249 // mov %rdi,%rcx
+ .byte 72,193,225,4 // shl $0x4,%rcx
+ .byte 68,15,40,192 // movaps %xmm0,%xmm8
+ .byte 68,15,40,200 // movaps %xmm0,%xmm9
+ .byte 68,15,20,201 // unpcklps %xmm1,%xmm9
+ .byte 68,15,40,210 // movaps %xmm2,%xmm10
+ .byte 68,15,40,218 // movaps %xmm2,%xmm11
+ .byte 68,15,20,219 // unpcklps %xmm3,%xmm11
+ .byte 68,15,21,193 // unpckhps %xmm1,%xmm8
+ .byte 68,15,21,211 // unpckhps %xmm3,%xmm10
+ .byte 69,15,40,225 // movaps %xmm9,%xmm12
+ .byte 102,69,15,20,227 // unpcklpd %xmm11,%xmm12
+ .byte 102,69,15,21,203 // unpckhpd %xmm11,%xmm9
+ .byte 69,15,40,216 // movaps %xmm8,%xmm11
+ .byte 102,69,15,20,218 // unpcklpd %xmm10,%xmm11
+ .byte 102,69,15,21,194 // unpckhpd %xmm10,%xmm8
+ .byte 102,68,15,17,36,8 // movupd %xmm12,(%rax,%rcx,1)
+ .byte 102,68,15,17,76,8,16 // movupd %xmm9,0x10(%rax,%rcx,1)
+ .byte 102,68,15,17,92,8,32 // movupd %xmm11,0x20(%rax,%rcx,1)
+ .byte 102,68,15,17,68,8,48 // movupd %xmm8,0x30(%rax,%rcx,1)
+ .byte 72,173 // lods %ds:(%rsi),%rax
+ .byte 255,224 // jmpq *%rax
+
.globl _sk_clamp_x_sse2
_sk_clamp_x_sse2:
.byte 72,173 // lods %ds:(%rsi),%rax
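
On both ARM targets the RGBA interleave is a single structured store: st4 {v0.4s-v3.4s} on aarch64 and vst4.32 {d0-d3} on ARMv7 write their four source registers element-interleaved, which is exactly the packing store_f32 needs. A minimal standalone sketch of the aarch64 form, using the same intrinsic the stage source uses (function name illustrative):

    #include <arm_neon.h>

    // Store 4 pixels: st4 interleaves lanes, emitting r0 g0 b0 a0, r1 g1 b1 a1, ...
    static void store4_f32(float* ptr, float32x4_t r, float32x4_t g,
                                       float32x4_t b, float32x4_t a) {
        float32x4x4_t rgba = {{ r, g, b, a }};
        vst4q_f32(ptr, rgba);
    }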
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
index d84a253..6afcfca 100644
--- a/src/jumper/SkJumper_generated_win.S
+++ b/src/jumper/SkJumper_generated_win.S
@@ -958,6 +958,50 @@
DB 197,121,214,68,248,48 ; vmovq %xmm8,0x30(%rax,%rdi,8)
DB 235,181 ; jmp c7c <_sk_store_f16_hsw+0x61>
+PUBLIC _sk_store_f32_hsw
+_sk_store_f32_hsw LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,0 ; mov (%rax),%r8
+ DB 72,141,4,189,0,0,0,0 ; lea 0x0(,%rdi,4),%rax
+ DB 197,124,20,193 ; vunpcklps %ymm1,%ymm0,%ymm8
+ DB 197,124,21,217 ; vunpckhps %ymm1,%ymm0,%ymm11
+ DB 197,108,20,203 ; vunpcklps %ymm3,%ymm2,%ymm9
+ DB 197,108,21,227 ; vunpckhps %ymm3,%ymm2,%ymm12
+ DB 196,65,61,20,209 ; vunpcklpd %ymm9,%ymm8,%ymm10
+ DB 196,65,61,21,201 ; vunpckhpd %ymm9,%ymm8,%ymm9
+ DB 196,65,37,20,196 ; vunpcklpd %ymm12,%ymm11,%ymm8
+ DB 196,65,37,21,220 ; vunpckhpd %ymm12,%ymm11,%ymm11
+ DB 72,133,201 ; test %rcx,%rcx
+ DB 117,55 ; jne d34 <_sk_store_f32_hsw+0x6d>
+ DB 196,67,45,24,225,1 ; vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
+ DB 196,67,61,24,235,1 ; vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
+ DB 196,67,45,6,201,49 ; vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
+ DB 196,67,61,6,195,49 ; vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
+ DB 196,65,125,17,36,128 ; vmovupd %ymm12,(%r8,%rax,4)
+ DB 196,65,125,17,108,128,32 ; vmovupd %ymm13,0x20(%r8,%rax,4)
+ DB 196,65,125,17,76,128,64 ; vmovupd %ymm9,0x40(%r8,%rax,4)
+ DB 196,65,125,17,68,128,96 ; vmovupd %ymm8,0x60(%r8,%rax,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,65,121,17,20,128 ; vmovupd %xmm10,(%r8,%rax,4)
+ DB 72,131,249,1 ; cmp $0x1,%rcx
+ DB 116,240 ; je d30 <_sk_store_f32_hsw+0x69>
+ DB 196,65,121,17,76,128,16 ; vmovupd %xmm9,0x10(%r8,%rax,4)
+ DB 72,131,249,3 ; cmp $0x3,%rcx
+ DB 114,227 ; jb d30 <_sk_store_f32_hsw+0x69>
+ DB 196,65,121,17,68,128,32 ; vmovupd %xmm8,0x20(%r8,%rax,4)
+ DB 116,218 ; je d30 <_sk_store_f32_hsw+0x69>
+ DB 196,65,121,17,92,128,48 ; vmovupd %xmm11,0x30(%r8,%rax,4)
+ DB 72,131,249,5 ; cmp $0x5,%rcx
+ DB 114,205 ; jb d30 <_sk_store_f32_hsw+0x69>
+ DB 196,67,125,25,84,128,64,1 ; vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
+ DB 116,195 ; je d30 <_sk_store_f32_hsw+0x69>
+ DB 196,67,125,25,76,128,80,1 ; vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
+ DB 72,131,249,7 ; cmp $0x7,%rcx
+ DB 114,181 ; jb d30 <_sk_store_f32_hsw+0x69>
+ DB 196,67,125,25,68,128,96,1 ; vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
+ DB 235,171 ; jmp d30 <_sk_store_f32_hsw+0x69>
+
PUBLIC _sk_clamp_x_hsw
_sk_clamp_x_hsw LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -2399,6 +2443,50 @@
DB 197,121,214,68,248,48 ; vmovq %xmm8,0x30(%rax,%rdi,8)
DB 235,181 ; jmp 11f7 <_sk_store_f16_avx+0xbf>
+PUBLIC _sk_store_f32_avx
+_sk_store_f32_avx LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 76,139,0 ; mov (%rax),%r8
+ DB 72,141,4,189,0,0,0,0 ; lea 0x0(,%rdi,4),%rax
+ DB 197,124,20,193 ; vunpcklps %ymm1,%ymm0,%ymm8
+ DB 197,124,21,217 ; vunpckhps %ymm1,%ymm0,%ymm11
+ DB 197,108,20,203 ; vunpcklps %ymm3,%ymm2,%ymm9
+ DB 197,108,21,227 ; vunpckhps %ymm3,%ymm2,%ymm12
+ DB 196,65,61,20,209 ; vunpcklpd %ymm9,%ymm8,%ymm10
+ DB 196,65,61,21,201 ; vunpckhpd %ymm9,%ymm8,%ymm9
+ DB 196,65,37,20,196 ; vunpcklpd %ymm12,%ymm11,%ymm8
+ DB 196,65,37,21,220 ; vunpckhpd %ymm12,%ymm11,%ymm11
+ DB 72,133,201 ; test %rcx,%rcx
+ DB 117,55 ; jne 12af <_sk_store_f32_avx+0x6d>
+ DB 196,67,45,24,225,1 ; vinsertf128 $0x1,%xmm9,%ymm10,%ymm12
+ DB 196,67,61,24,235,1 ; vinsertf128 $0x1,%xmm11,%ymm8,%ymm13
+ DB 196,67,45,6,201,49 ; vperm2f128 $0x31,%ymm9,%ymm10,%ymm9
+ DB 196,67,61,6,195,49 ; vperm2f128 $0x31,%ymm11,%ymm8,%ymm8
+ DB 196,65,125,17,36,128 ; vmovupd %ymm12,(%r8,%rax,4)
+ DB 196,65,125,17,108,128,32 ; vmovupd %ymm13,0x20(%r8,%rax,4)
+ DB 196,65,125,17,76,128,64 ; vmovupd %ymm9,0x40(%r8,%rax,4)
+ DB 196,65,125,17,68,128,96 ; vmovupd %ymm8,0x60(%r8,%rax,4)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+ DB 196,65,121,17,20,128 ; vmovupd %xmm10,(%r8,%rax,4)
+ DB 72,131,249,1 ; cmp $0x1,%rcx
+ DB 116,240 ; je 12ab <_sk_store_f32_avx+0x69>
+ DB 196,65,121,17,76,128,16 ; vmovupd %xmm9,0x10(%r8,%rax,4)
+ DB 72,131,249,3 ; cmp $0x3,%rcx
+ DB 114,227 ; jb 12ab <_sk_store_f32_avx+0x69>
+ DB 196,65,121,17,68,128,32 ; vmovupd %xmm8,0x20(%r8,%rax,4)
+ DB 116,218 ; je 12ab <_sk_store_f32_avx+0x69>
+ DB 196,65,121,17,92,128,48 ; vmovupd %xmm11,0x30(%r8,%rax,4)
+ DB 72,131,249,5 ; cmp $0x5,%rcx
+ DB 114,205 ; jb 12ab <_sk_store_f32_avx+0x69>
+ DB 196,67,125,25,84,128,64,1 ; vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4)
+ DB 116,195 ; je 12ab <_sk_store_f32_avx+0x69>
+ DB 196,67,125,25,76,128,80,1 ; vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4)
+ DB 72,131,249,7 ; cmp $0x7,%rcx
+ DB 114,181 ; jb 12ab <_sk_store_f32_avx+0x69>
+ DB 196,67,125,25,68,128,96,1 ; vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4)
+ DB 235,171 ; jmp 12ab <_sk_store_f32_avx+0x69>
+
PUBLIC _sk_clamp_x_avx
_sk_clamp_x_avx LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -3405,6 +3493,33 @@
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_store_f32_sse41
+_sk_store_f32_sse41 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 72,137,249 ; mov %rdi,%rcx
+ DB 72,193,225,4 ; shl $0x4,%rcx
+ DB 68,15,40,192 ; movaps %xmm0,%xmm8
+ DB 68,15,40,200 ; movaps %xmm0,%xmm9
+ DB 68,15,20,201 ; unpcklps %xmm1,%xmm9
+ DB 68,15,40,210 ; movaps %xmm2,%xmm10
+ DB 68,15,40,218 ; movaps %xmm2,%xmm11
+ DB 68,15,20,219 ; unpcklps %xmm3,%xmm11
+ DB 68,15,21,193 ; unpckhps %xmm1,%xmm8
+ DB 68,15,21,211 ; unpckhps %xmm3,%xmm10
+ DB 69,15,40,225 ; movaps %xmm9,%xmm12
+ DB 102,69,15,20,227 ; unpcklpd %xmm11,%xmm12
+ DB 102,69,15,21,203 ; unpckhpd %xmm11,%xmm9
+ DB 69,15,40,216 ; movaps %xmm8,%xmm11
+ DB 102,69,15,20,218 ; unpcklpd %xmm10,%xmm11
+ DB 102,69,15,21,194 ; unpckhpd %xmm10,%xmm8
+ DB 102,68,15,17,36,8 ; movupd %xmm12,(%rax,%rcx,1)
+ DB 102,68,15,17,76,8,16 ; movupd %xmm9,0x10(%rax,%rcx,1)
+ DB 102,68,15,17,92,8,32 ; movupd %xmm11,0x20(%rax,%rcx,1)
+ DB 102,68,15,17,68,8,48 ; movupd %xmm8,0x30(%rax,%rcx,1)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_clamp_x_sse41
_sk_clamp_x_sse41 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
@@ -4465,6 +4580,33 @@
DB 72,173 ; lods %ds:(%rsi),%rax
DB 255,224 ; jmpq *%rax
+PUBLIC _sk_store_f32_sse2
+_sk_store_f32_sse2 LABEL PROC
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 72,139,0 ; mov (%rax),%rax
+ DB 72,137,249 ; mov %rdi,%rcx
+ DB 72,193,225,4 ; shl $0x4,%rcx
+ DB 68,15,40,192 ; movaps %xmm0,%xmm8
+ DB 68,15,40,200 ; movaps %xmm0,%xmm9
+ DB 68,15,20,201 ; unpcklps %xmm1,%xmm9
+ DB 68,15,40,210 ; movaps %xmm2,%xmm10
+ DB 68,15,40,218 ; movaps %xmm2,%xmm11
+ DB 68,15,20,219 ; unpcklps %xmm3,%xmm11
+ DB 68,15,21,193 ; unpckhps %xmm1,%xmm8
+ DB 68,15,21,211 ; unpckhps %xmm3,%xmm10
+ DB 69,15,40,225 ; movaps %xmm9,%xmm12
+ DB 102,69,15,20,227 ; unpcklpd %xmm11,%xmm12
+ DB 102,69,15,21,203 ; unpckhpd %xmm11,%xmm9
+ DB 69,15,40,216 ; movaps %xmm8,%xmm11
+ DB 102,69,15,20,218 ; unpcklpd %xmm10,%xmm11
+ DB 102,69,15,21,194 ; unpckhpd %xmm10,%xmm8
+ DB 102,68,15,17,36,8 ; movupd %xmm12,(%rax,%rcx,1)
+ DB 102,68,15,17,76,8,16 ; movupd %xmm9,0x10(%rax,%rcx,1)
+ DB 102,68,15,17,92,8,32 ; movupd %xmm11,0x20(%rax,%rcx,1)
+ DB 102,68,15,17,68,8,48 ; movupd %xmm8,0x30(%rax,%rcx,1)
+ DB 72,173 ; lods %ds:(%rsi),%rax
+ DB 255,224 ; jmpq *%rax
+
PUBLIC _sk_clamp_x_sse2
_sk_clamp_x_sse2 LABEL PROC
DB 72,173 ; lods %ds:(%rsi),%rax
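
The hsw and avx bodies are the same 8x4 transpose: unpcklps/unpckhps pair r with g and b with a, unpcklpd/unpckhpd gather whole pixels within each 128-bit lane, and vinsertf128/vperm2f128 fix up lane order before the four 32-byte stores; the tail path after the test %rcx stores one 16-byte pixel at a time, guarded by the cmp/je/jb chain. A standalone sketch of the full-width path with the casts between float and double vector views written out explicitly (function name illustrative):

    #include <immintrin.h>

    // Store 8 pixels from planar r,g,b,a (one ymm each) as interleaved RGBA.
    static void store8_f32(float* ptr, __m256 r, __m256 g, __m256 b, __m256 a) {
        __m256 rg0145 = _mm256_unpacklo_ps(r, g);   // r0 g0 r1 g1 | r4 g4 r5 g5
        __m256 rg2367 = _mm256_unpackhi_ps(r, g);   // r2 g2 r3 g3 | r6 g6 r7 g7
        __m256 ba0145 = _mm256_unpacklo_ps(b, a);
        __m256 ba2367 = _mm256_unpackhi_ps(b, a);

        __m256d p04 = _mm256_unpacklo_pd(_mm256_castps_pd(rg0145),
                                         _mm256_castps_pd(ba0145));  // pixel 0 | pixel 4
        __m256d p15 = _mm256_unpackhi_pd(_mm256_castps_pd(rg0145),
                                         _mm256_castps_pd(ba0145));  // pixel 1 | pixel 5
        __m256d p26 = _mm256_unpacklo_pd(_mm256_castps_pd(rg2367),
                                         _mm256_castps_pd(ba2367));  // pixel 2 | pixel 6
        __m256d p37 = _mm256_unpackhi_pd(_mm256_castps_pd(rg2367),
                                         _mm256_castps_pd(ba2367));  // pixel 3 | pixel 7

        double* d = (double*)ptr;
        _mm256_storeu_pd(d +  0, _mm256_permute2f128_pd(p04, p15, 0x20)); // pixels 0,1
        _mm256_storeu_pd(d +  4, _mm256_permute2f128_pd(p26, p37, 0x20)); // pixels 2,3
        _mm256_storeu_pd(d +  8, _mm256_permute2f128_pd(p04, p15, 0x31)); // pixels 4,5
        _mm256_storeu_pd(d + 12, _mm256_permute2f128_pd(p26, p37, 0x31)); // pixels 6,7
    }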
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 2f9c60a..0147408 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -1000,6 +1000,57 @@
#endif
}
+STAGE(store_f32) {
+ auto ptr = *(float**)ctx + 4*x;   // interleaved RGBA_F32: 4 floats per pixel
+
+#if !defined(JUMPER)   // portable build: one pixel at a time
+ ptr[0] = r;
+ ptr[1] = g;
+ ptr[2] = b;
+ ptr[3] = a;
+#elif defined(__aarch64__)
+ vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));   // st4: interleaved store of 4 pixels
+#elif defined(__arm__)
+ vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));   // vst4.32: interleaved store of 2 pixels
+#elif defined(__AVX__)
+ F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
+ rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
+ ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
+ ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
+
+ F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4
+ _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ...
+ _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ...
+ _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ...
+
+ if (__builtin_expect(tail, 0)) {   // tail != 0: a partial run of 1-7 pixels
+ if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
+ if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
+ if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
+ if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
+ if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
+ if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
+ if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
+ } else {
+ F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
+ _23 = _mm256_permute2f128_ps(_26, _37, 32),
+ _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
+ _67 = _mm256_permute2f128_ps(_26, _37, 49);
+ _mm256_storeu_ps(ptr+ 0, _01);
+ _mm256_storeu_ps(ptr+ 8, _23);
+ _mm256_storeu_ps(ptr+16, _45);
+ _mm256_storeu_ps(ptr+24, _67);
+ }
+#elif defined(__SSE2__)
+ auto v0 = r, v1 = g, v2 = b, v3 = a;
+ _MM_TRANSPOSE4_PS(v0, v1, v2, v3);   // 4x4 transpose: vN becomes pixel N's r,g,b,a
+ memcpy(ptr+ 0, &v0, sizeof(v0));
+ memcpy(ptr+ 4, &v1, sizeof(v1));
+ memcpy(ptr+ 8, &v2, sizeof(v2));
+ memcpy(ptr+12, &v3, sizeof(v3));
+#endif
+}
+
static F ulp_before(F v) {
return bit_cast<F>(bit_cast<U32>(v) + U32(0xffffffff));
}
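
For the 4-pixel SSE2/SSE4.1 variants the same job is done by the classic 4x4 transpose macro _MM_TRANSPOSE4_PS followed by four unaligned 16-byte stores; the memcpy calls compile down to the movupd stores visible in the generated assembly above. A standalone sketch (function name illustrative):

    #include <xmmintrin.h>
    #include <string.h>

    // Transpose planar r,g,b,a into 4 interleaved RGBA pixels and store them.
    static void store4_f32_sse(float* ptr, __m128 r, __m128 g, __m128 b, __m128 a) {
        _MM_TRANSPOSE4_PS(r, g, b, a);    // r now holds r0 g0 b0 a0, g holds r1 g1 b1 a1, ...
        memcpy(ptr +  0, &r, sizeof(r));  // unaligned-safe 16-byte stores
        memcpy(ptr +  4, &g, sizeof(g));
        memcpy(ptr +  8, &b, sizeof(b));
        memcpy(ptr + 12, &a, sizeof(a));
    }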