GEMM aarch64 microkernels use LDP to fetch cn_stride together with params (or acc in INC variants)

PiperOrigin-RevId: 307441932
diff --git a/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in b/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
index 348f970..6e4e142 100644
--- a/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
+++ b/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
@@ -54,13 +54,22 @@
 # C   v26
 # C   v28
 # C   v30
-# Clamp v6 v7
+# Clamp v6, (v4), (v5)
 # unused A   v8 v9 v10 v11
 # unused B   v12 v13 v14 v15
 
-# Clamp v6, (v4), (v5)
 
 BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
+
+        $if INC:
+          # Load cn_stride, acc
+          LDP x14, x15, [sp]
+          # Load params pointer
+          LDR x8, [sp, 16]
+        $else:
+          # Load cn_stride, params pointer
+          LDP x14, x8, [sp]
+
         # Clamp A and C pointers
         CMP x0, 2                // if mr < 2
         ADD x9, x3, x4           // a1 = a0 + a_stride
@@ -86,13 +95,6 @@
         CSEL x12, x11, x12, LS   //   a4 = a3
         CSEL x13, x18, x13, LS   //   c4 = c3
 
-        $if INC:
-          # Load acc, params pointer
-          LDP x15, x8, [sp, 8]
-        $else:
-          # Load params pointer
-          LDR x8, [sp, 8]
-
         CMP x0, 6                // if mr < 6
         ADD x4, x12, x4          // a5 = a4 + a_stride
         ADD x7, x13, x7          // c5 = c4 + cm_stride
@@ -103,9 +105,6 @@
         LD1R {v6.8h}, [x8]
         ADD x8, x8, 2
 
-        # Load cn_stride
-        LDR x14, [sp]
-
 0:
         $if INC:
           # Load initial accumulators