ST1 post-increment for ld64/ld128 GEMM/IGEMM microkernels
Simplifies reference kernels by replacing each STP + ADD pair with a single post-indexed ST1 store. Performance is about the same on most CPUs.
PiperOrigin-RevId: 280032091
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
index 3a75826..5205a13 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
@@ -225,42 +225,30 @@
B.LO 5f
$if INC:
- STP q30, q31, [x7]
- ADD x7, x7, x14
+ ST1 {v30.16b, v31.16b}, [x7], x14
SUB x3, x3, x2 // a0 -= kc
- STP q28, q29, [x13]
- ADD x13, x13, x14
+ ST1 {v28.16b, v29.16b}, [x13], x14
SUB x9, x9, x2 // a1 -= kc
- STP q26, q27, [x18]
- ADD x18, x18, x14
+ ST1 {v26.16b, v27.16b}, [x18], x14
SUB x10, x10, x2 // a2 -= kc
- STP q24, q25, [x17]
- ADD x17, x17, x14
+ ST1 {v24.16b, v25.16b}, [x17], x14
SUB x11, x11, x2 // a3 -= kc
- STP q22, q23, [x16]
- ADD x16, x16, x14
+ ST1 {v22.16b, v23.16b}, [x16], x14
SUB x12, x12, x2 // a4 -= kc
- STP q20, q21, [x6]
- ADD x6, x6, x14
+ ST1 {v20.16b, v21.16b}, [x6], x14
SUB x4, x4, x2 // a5 -= kc
$else:
- STP q20, q21, [x6]
- ADD x6, x6, x14
+ ST1 {v20.16b, v21.16b}, [x6], x14
SUB x3, x3, x2 // a0 -= kc
- STP q22, q23, [x16]
- ADD x16, x16, x14
+ ST1 {v22.16b, v23.16b}, [x16], x14
SUB x9, x9, x2 // a1 -= kc
- STP q24, q25, [x17]
- ADD x17, x17, x14
+ ST1 {v24.16b, v25.16b}, [x17], x14
SUB x10, x10, x2 // a2 -= kc
- STP q26, q27, [x18]
- ADD x18, x18, x14
+ ST1 {v26.16b, v27.16b}, [x18], x14
SUB x11, x11, x2 // a3 -= kc
- STP q28, q29, [x13]
- ADD x13, x13, x14
+ ST1 {v28.16b, v29.16b}, [x13], x14
SUB x12, x12, x2 // a4 -= kc
- STP q30, q31, [x7]
- ADD x7, x7, x14
+ ST1 {v30.16b, v31.16b}, [x7], x14
SUB x4, x4, x2 // a5 -= kc
SUBS x1, x1, 8