GEMM 4x8 and 4x12 kernels use forward stores for C.

GEMMINC and IGEMM kernels keep the reversed store order, but plain GEMM
kernels can store forward for improved performance.
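
For context, a minimal C sketch of the two store orders the template now
selects per variant; the helper name and the row mapping (x6/x9/x10/x7 as
rows 0..3 of the output tile) are illustrative assumptions, not XNNPACK code:

    #include <stddef.h>

    /* Hypothetical sketch: store a 4-row accumulator tile to the four C row
     * pointers.  The INC path keeps the reverse order (last row first); the
     * plain GEMM path stores forward (row 0 first), mirroring the
     * $if INC: / $else: branches in the .S.in template. */
    static void store_c_tile(float* c[4], const float acc[4][8], size_t nr,
                             int inc_variant) {
      if (inc_variant) {
        for (int row = 3; row >= 0; row--) {    /* reverse: row 3 .. row 0 */
          for (size_t n = 0; n < nr; n++) c[row][n] = acc[row][n];
        }
      } else {
        for (int row = 0; row < 4; row++) {     /* forward: row 0 .. row 3 */
          for (size_t n = 0; n < nr; n++) c[row][n] = acc[row][n];
        }
      }
    }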

Was
sgemm_4x8__aarch64_neonfma_cortex_a53                    44302611         21
sgemm_4x12__aarch64_neonfma_cortex_a53                   48045419         21
sgemm_4x8__aarch64_neonfma_cortex_a75                    58764775         21
sgemm_4x8__aarch64_neonfma_ld64                          61698465         21
sgemm_4x8__aarch64_neonfma_ld128                         61745255         21
sgemm_4x8__aarch64_neonfma_cortex_a57                    65176514         21

Now
sgemm_4x8__aarch64_neonfma_cortex_a53                    43837171         21
sgemm_4x12__aarch64_neonfma_cortex_a53                   47806009         21
sgemm_4x8__aarch64_neonfma_cortex_a75                    57647481         21
sgemm_4x8__aarch64_neonfma_ld64                          60545561         21
sgemm_4x8__aarch64_neonfma_ld128                         61501531         21
sgemm_4x8__aarch64_neonfma_cortex_a57                    64329484         21

PiperOrigin-RevId: 280745893
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
index 248aeae..d99689b 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
@@ -416,19 +416,32 @@
         CMP x1, 8
         B.LO 7f
 
-        STP q30, q31,  [x7]
-        ADD  x7,  x7, x14
-        STP q28, q29, [x10]
-        ADD x10, x10, x14
-        STP q18, q19,  [x9]
-        ADD  x9,  x9, x14
-        STP q16, q17,  [x6]
-        ADD  x6,  x6, x14
-
-        SUB  x3,  x3, x2 // a0 -= kc
-        SUB x11, x11, x2 // a1 -= kc
-        SUB x12, x12, x2 // a2 -= kc
-        SUB  x4,  x4, x2 // a3 -= kc
+        $if INC:
+          STP q30, q31,  [x7]
+          SUB  x3,  x3, x2 // a0 -= kc
+          ADD  x7,  x7, x14
+          STP q28, q29, [x10]
+          SUB x11, x11, x2 // a1 -= kc
+          ADD x10, x10, x14
+          STP q18, q19,  [x9]
+          SUB x12, x12, x2 // a2 -= kc
+          ADD  x9,  x9, x14
+          STP q16, q17,  [x6]
+          SUB  x4,  x4, x2 // a3 -= kc
+          ADD  x6,  x6, x14
+        $else:
+          STP q16, q17,  [x6]
+          SUB  x3,  x3, x2 // a0 -= kc
+          ADD  x6,  x6, x14
+          STP q18, q19,  [x9]
+          SUB x11, x11, x2 // a1 -= kc
+          ADD  x9,  x9, x14
+          STP q28, q29, [x10]
+          SUB x12, x12, x2 // a2 -= kc
+          ADD x10, x10, x14
+          STP q30, q31,  [x7]
+          SUB  x4,  x4, x2 // a3 -= kc
+          ADD  x7,  x7, x14
 
         SUBS x1, x1, 8
         B.HI 0b
@@ -443,32 +456,58 @@
         # Store odd width
 7:
         TBZ x1, 2, 8f
-        STR q30, [x7], 16
-        MOV v30.16b, v31.16b
-        STR q28, [x10], 16
-        MOV v28.16b, v29.16b
-        STR q18, [x9], 16
-        MOV v18.16b, v19.16b
-        STR q16, [x6], 16
-        MOV v16.16b, v17.16b
+        $if INC:
+          STR q30, [x7], 16
+          MOV v30.16b, v31.16b
+          STR q28, [x10], 16
+          MOV v28.16b, v29.16b
+          STR q18, [x9], 16
+          MOV v18.16b, v19.16b
+          STR q16, [x6], 16
+          MOV v16.16b, v17.16b
+        $else:
+          STR q16, [x6], 16
+          MOV v16.16b, v17.16b
+          STR q18, [x9], 16
+          MOV v18.16b, v19.16b
+          STR q28, [x10], 16
+          MOV v28.16b, v29.16b
+          STR q30, [x7], 16
+          MOV v30.16b, v31.16b
 
 8:
         TBZ x1, 1, 9f
-        STR d30, [x7], 8
-        DUP d30, v30.d[1]
-        STR d28, [x10], 8
-        DUP d28, v28.d[1]
-        STR d18, [x9], 8
-        DUP d18, v18.d[1]
-        STR d16, [x6], 8
-        DUP d16, v16.d[1]
+        $if INC:
+          STR d30, [x7], 8
+          DUP d30, v30.d[1]
+          STR d28, [x10], 8
+          DUP d28, v28.d[1]
+          STR d18, [x9], 8
+          DUP d18, v18.d[1]
+          STR d16, [x6], 8
+          DUP d16, v16.d[1]
+        $else:
+          STR d16, [x6], 8
+          DUP d16, v16.d[1]
+          STR d18, [x9], 8
+          DUP d18, v18.d[1]
+          STR d28, [x10], 8
+          DUP d28, v28.d[1]
+          STR d30, [x7], 8
+          DUP d30, v30.d[1]
 
 9:
         TBZ x1, 0, 10f
-        STR s30,  [x7]
-        STR s28, [x10]
-        STR s18,  [x9]
-        STR s16,  [x6]
+        $if INC:
+          STR s30,  [x7]
+          STR s28, [x10]
+          STR s18,  [x9]
+          STR s16,  [x6]
+        $else:
+          STR s16,  [x6]
+          STR s18,  [x9]
+          STR s28, [x10]
+          STR s30,  [x7]
 10:
         # Restore d8-d15 from stack
         LDP d14, d15, [sp, 48]
@@ -477,6 +516,7 @@
         LDP  d8,  d9, [sp], 64
         RET
 
+
 END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a75
 
 #ifdef __ELF__