ST1 post-increment for ld64/ld128 GEMM/IGEMM microkernels

Simplifies reference kernels. Performance is about the same on most CPUs.

PiperOrigin-RevId: 280032091
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
index 3a75826..5205a13 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
@@ -225,42 +225,30 @@
         B.LO 5f
 
         $if INC:
-          STP q30, q31,  [x7]
-          ADD x7, x7, x14
+          ST1 {v30.16b, v31.16b},  [x7], x14
           SUB  x3,  x3, x2 // a0 -= kc
-          STP q28, q29, [x13]
-          ADD x13, x13, x14
+          ST1 {v28.16b, v29.16b}, [x13], x14
           SUB  x9,  x9, x2 // a1 -= kc
-          STP q26, q27, [x18]
-          ADD x18, x18, x14
+          ST1 {v26.16b, v27.16b}, [x18], x14
           SUB x10, x10, x2 // a2 -= kc
-          STP q24, q25, [x17]
-          ADD x17, x17, x14
+          ST1 {v24.16b, v25.16b}, [x17], x14
           SUB x11, x11, x2 // a3 -= kc
-          STP q22, q23, [x16]
-          ADD x16, x16, x14
+          ST1 {v22.16b, v23.16b}, [x16], x14
           SUB x12, x12, x2 // a4 -= kc
-          STP q20, q21,  [x6]
-          ADD  x6,  x6, x14
+          ST1 {v20.16b, v21.16b},  [x6], x14
           SUB  x4,  x4, x2 // a5 -= kc
         $else:
-          STP q20, q21,  [x6]
-          ADD  x6,  x6, x14
+          ST1 {v20.16b, v21.16b},  [x6], x14
           SUB  x3,  x3, x2 // a0 -= kc
-          STP q22, q23, [x16]
-          ADD x16, x16, x14
+          ST1 {v22.16b, v23.16b}, [x16], x14
           SUB  x9,  x9, x2 // a1 -= kc
-          STP q24, q25, [x17]
-          ADD x17, x17, x14
+          ST1 {v24.16b, v25.16b}, [x17], x14
           SUB x10, x10, x2 // a2 -= kc
-          STP q26, q27, [x18]
-          ADD x18, x18, x14
+          ST1 {v26.16b, v27.16b}, [x18], x14
           SUB x11, x11, x2 // a3 -= kc
-          STP q28, q29, [x13]
-          ADD x13, x13, x14
+          ST1 {v28.16b, v29.16b}, [x13], x14
           SUB x12, x12, x2 // a4 -= kc
-          STP q30, q31,  [x7]
-          ADD x7, x7, x14
+          ST1 {v30.16b, v31.16b},  [x7], x14
           SUB  x4,  x4, x2 // a5 -= kc
 
         SUBS x1, x1, 8