6x8 A53 GEMM: use prefetch.

20.88% faster than the previous 6x8; 10.6% faster than 4x12.

Switch CPU detection for A53 to select 6x8 by default.

PiperOrigin-RevId: 277602956
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
index afef2dd..617b7cc 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
@@ -108,30 +108,41 @@
         # Load initial bias from w into accumulators
         LDP q20, q21, [x5], 32
         MOV v22.16b, v20.16b
-        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
+        PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
+        PRFM PLDL1KEEP,  [x3, 64]
         MOV v23.16b, v21.16b
-        PRFM PLDL1KEEP, [x5, 64]
+        PRFM PLDL1KEEP,  [x9,  0]
+        PRFM PLDL1KEEP,  [x9, 64]
         MOV v24.16b, v20.16b
-        PRFM PLDL1KEEP, [x5, 128]
+        PRFM PLDL1KEEP, [x10,  0]
+        PRFM PLDL1KEEP, [x10, 64]
         MOV v25.16b, v21.16b
-        PRFM PLDL1KEEP, [x5, 192]
+        PRFM PLDL1KEEP, [x11,  0]
+        PRFM PLDL1KEEP, [x11, 64]
         MOV v26.16b, v20.16b
-        PRFM PLDL1KEEP,  [x3]    // Prefetch A
+        PRFM PLDL1KEEP, [x12,  0]
+        PRFM PLDL1KEEP, [x12, 64]
         MOV v27.16b, v21.16b
-        PRFM PLDL1KEEP,  [x9]
+        PRFM PLDL1KEEP,  [x4,  0]
+        PRFM PLDL1KEEP,  [x4, 64]
         MOV v28.16b, v20.16b
-        PRFM PLDL1KEEP, [x10]
+        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+        PRFM PLDL1KEEP, [x5,  64]
         MOV v29.16b, v21.16b
-        PRFM PLDL1KEEP, [x11]
+        PRFM PLDL1KEEP, [x5, 128]
+        PRFM PLDL1KEEP, [x5, 192]
         MOV v30.16b, v20.16b
-        PRFM PLDL1KEEP, [x12]
+        PRFM PLDL1KEEP, [x5, 256]
+        PRFM PLDL1KEEP, [x5, 320]
         MOV v31.16b, v21.16b
-        PRFM PLDL1KEEP,  [x4]
 
         # Is there at least 4 floats (16 bytes) for main loop?
         SUBS x0, x2, 16  // k = kc - 16
         B.LO 2f
 
+        # Additional prefetches for main loop
+
+
         # Main loop - 2 floats of A (8 bytes)
         # 48 FMA + 12 LD64 A + 4 LDP B
 1:
@@ -176,15 +187,23 @@
         FMLA v28.4s, v18.4s,  v2.s[1]
         LD1   {v5.d}[1], [x4], 8
         FMLA v30.4s, v18.4s,  v2.s[3]
+        PRFM PLDL1KEEP, [x3, 128]  // Prefetch A0
         FMLA v21.4s, v19.4s,  v0.s[1]
+        PRFM PLDL1KEEP, [x9, 128]  // Prefetch A1
         FMLA v23.4s, v19.4s,  v0.s[3]
+        PRFM PLDL1KEEP, [x10, 128]
         FMLA v25.4s, v19.4s,  v1.s[1]
+        PRFM PLDL1KEEP, [x11, 128]
         FMLA v27.4s, v19.4s,  v1.s[3]
+        PRFM PLDL1KEEP, [x12, 128]
         FMLA v29.4s, v19.4s,  v2.s[1]
+        PRFM PLDL1KEEP,  [x4, 128]
         FMLA v31.4s, v19.4s,  v2.s[3]
+        PRFM PLDL1KEEP, [x5, 256]  // Prefetch B
 
         # Second group of 24 FMA
         FMLA v20.4s, v12.4s,  v3.s[0]
+        PRFM PLDL1KEEP, [x5, 320]  // Prefetch B
         FMLA v22.4s, v12.4s,  v3.s[2]
         FMLA v24.4s, v12.4s,  v4.s[0]
         FMLA v26.4s, v12.4s,  v4.s[2]
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
index c697c68..3942a30 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
@@ -117,39 +117,55 @@
           LDP q26, q27, [x15], 32
           LDP q28, q29, [x15], 32
           LDP q30, q31, [x15], 32
-          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
-          PRFM PLDL1KEEP, [x5, 64]
+          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
+          PRFM PLDL1KEEP,  [x3, 64]
+          PRFM PLDL1KEEP,  [x9,  0]
+          PRFM PLDL1KEEP,  [x9, 64]
+          PRFM PLDL1KEEP, [x10,  0]
+          PRFM PLDL1KEEP, [x10, 64]
+          PRFM PLDL1KEEP, [x11,  0]
+          PRFM PLDL1KEEP, [x11, 64]
+          PRFM PLDL1KEEP, [x12,  0]
+          PRFM PLDL1KEEP, [x12, 64]
+          PRFM PLDL1KEEP,  [x4,  0]
+          PRFM PLDL1KEEP,  [x4, 64]
+          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+          PRFM PLDL1KEEP, [x5,  64]
           PRFM PLDL1KEEP, [x5, 128]
           PRFM PLDL1KEEP, [x5, 192]
-          PRFM PLDL1KEEP,  [x3]    // Prefetch A
-          PRFM PLDL1KEEP,  [x9]
-          PRFM PLDL1KEEP, [x10]
-          PRFM PLDL1KEEP, [x11]
-          PRFM PLDL1KEEP, [x12]
-          PRFM PLDL1KEEP,  [x4]
+          PRFM PLDL1KEEP, [x5, 256]
+          PRFM PLDL1KEEP, [x5, 320]
         $else:
           # Load initial bias from w into accumulators
           LDP q20, q21, [x5], 32
           MOV v22.16b, v20.16b
-          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
+          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
+          PRFM PLDL1KEEP,  [x3, 64]
           MOV v23.16b, v21.16b
-          PRFM PLDL1KEEP, [x5, 64]
+          PRFM PLDL1KEEP,  [x9,  0]
+          PRFM PLDL1KEEP,  [x9, 64]
           MOV v24.16b, v20.16b
-          PRFM PLDL1KEEP, [x5, 128]
+          PRFM PLDL1KEEP, [x10,  0]
+          PRFM PLDL1KEEP, [x10, 64]
           MOV v25.16b, v21.16b
-          PRFM PLDL1KEEP, [x5, 192]
+          PRFM PLDL1KEEP, [x11,  0]
+          PRFM PLDL1KEEP, [x11, 64]
           MOV v26.16b, v20.16b
-          PRFM PLDL1KEEP,  [x3]    // Prefetch A
+          PRFM PLDL1KEEP, [x12,  0]
+          PRFM PLDL1KEEP, [x12, 64]
           MOV v27.16b, v21.16b
-          PRFM PLDL1KEEP,  [x9]
+          PRFM PLDL1KEEP,  [x4,  0]
+          PRFM PLDL1KEEP,  [x4, 64]
           MOV v28.16b, v20.16b
-          PRFM PLDL1KEEP, [x10]
+          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+          PRFM PLDL1KEEP, [x5,  64]
           MOV v29.16b, v21.16b
-          PRFM PLDL1KEEP, [x11]
+          PRFM PLDL1KEEP, [x5, 128]
+          PRFM PLDL1KEEP, [x5, 192]
           MOV v30.16b, v20.16b
-          PRFM PLDL1KEEP, [x12]
+          PRFM PLDL1KEEP, [x5, 256]
+          PRFM PLDL1KEEP, [x5, 320]
           MOV v31.16b, v21.16b
-          PRFM PLDL1KEEP,  [x4]
 
         # Is there at least 4 floats (16 bytes) for main loop?
         SUBS x0, x2, 16  // k = kc - 16
@@ -199,15 +215,23 @@
         FMLA v28.4s, v18.4s,  v2.s[1]
         LD1   {v5.d}[1], [x4], 8
         FMLA v30.4s, v18.4s,  v2.s[3]
+        PRFM PLDL1KEEP, [x3, 128]  // Prefetch A0
         FMLA v21.4s, v19.4s,  v0.s[1]
+        PRFM PLDL1KEEP, [x9, 128]  // Prefetch A1
         FMLA v23.4s, v19.4s,  v0.s[3]
+        PRFM PLDL1KEEP, [x10, 128]
         FMLA v25.4s, v19.4s,  v1.s[1]
+        PRFM PLDL1KEEP, [x11, 128]
         FMLA v27.4s, v19.4s,  v1.s[3]
+        PRFM PLDL1KEEP, [x12, 128]
         FMLA v29.4s, v19.4s,  v2.s[1]
+        PRFM PLDL1KEEP,  [x4, 128]
         FMLA v31.4s, v19.4s,  v2.s[3]
+        PRFM PLDL1KEEP, [x5, 256]  // Prefetch B
 
         # Second group of 24 FMA
         FMLA v20.4s, v12.4s,  v3.s[0]
+        PRFM PLDL1KEEP, [x5, 320]  // Prefetch B
         FMLA v22.4s, v12.4s,  v3.s[2]
         FMLA v24.4s, v12.4s,  v4.s[0]
         FMLA v26.4s, v12.4s,  v4.s[2]
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
index 32f33d2..14df321 100644
--- a/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
@@ -57,7 +57,7 @@
 # C   v28 v29
 # C   v30 v31
 # Clamp v6 v7
-# unused A   v3, v4, v5, v8 v9 v10 v11
+# unused A   v8 v9 v10 v11
 
 BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53
 
@@ -113,21 +113,32 @@
         LDP q26, q27, [x15], 32
         LDP q28, q29, [x15], 32
         LDP q30, q31, [x15], 32
-        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
-        PRFM PLDL1KEEP, [x5, 64]
+        PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
+        PRFM PLDL1KEEP,  [x3, 64]
+        PRFM PLDL1KEEP,  [x9,  0]
+        PRFM PLDL1KEEP,  [x9, 64]
+        PRFM PLDL1KEEP, [x10,  0]
+        PRFM PLDL1KEEP, [x10, 64]
+        PRFM PLDL1KEEP, [x11,  0]
+        PRFM PLDL1KEEP, [x11, 64]
+        PRFM PLDL1KEEP, [x12,  0]
+        PRFM PLDL1KEEP, [x12, 64]
+        PRFM PLDL1KEEP,  [x4,  0]
+        PRFM PLDL1KEEP,  [x4, 64]
+        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+        PRFM PLDL1KEEP, [x5,  64]
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
-        PRFM PLDL1KEEP,  [x3]    // Prefetch A
-        PRFM PLDL1KEEP,  [x9]
-        PRFM PLDL1KEEP, [x10]
-        PRFM PLDL1KEEP, [x11]
-        PRFM PLDL1KEEP, [x12]
-        PRFM PLDL1KEEP,  [x4]
+        PRFM PLDL1KEEP, [x5, 256]
+        PRFM PLDL1KEEP, [x5, 320]
 
         # Is there at least 4 floats (16 bytes) for main loop?
         SUBS x0, x2, 16  // k = kc - 16
         B.LO 2f
 
+        # Additional prefetches for main loop
+
+
         # Main loop - 2 floats of A (8 bytes)
         # 48 FMA + 12 LD64 A + 4 LDP B
 1:
@@ -172,15 +183,23 @@
         FMLA v28.4s, v18.4s,  v2.s[1]
         LD1   {v5.d}[1], [x4], 8
         FMLA v30.4s, v18.4s,  v2.s[3]
+        PRFM PLDL1KEEP, [x3, 128]  // Prefetch A0
         FMLA v21.4s, v19.4s,  v0.s[1]
+        PRFM PLDL1KEEP, [x9, 128]  // Prefetch A1
         FMLA v23.4s, v19.4s,  v0.s[3]
+        PRFM PLDL1KEEP, [x10, 128]
         FMLA v25.4s, v19.4s,  v1.s[1]
+        PRFM PLDL1KEEP, [x11, 128]
         FMLA v27.4s, v19.4s,  v1.s[3]
+        PRFM PLDL1KEEP, [x12, 128]
         FMLA v29.4s, v19.4s,  v2.s[1]
+        PRFM PLDL1KEEP,  [x4, 128]
         FMLA v31.4s, v19.4s,  v2.s[3]
+        PRFM PLDL1KEEP, [x5, 256]  // Prefetch B
 
         # Second group of 24 FMA
         FMLA v20.4s, v12.4s,  v3.s[0]
+        PRFM PLDL1KEEP, [x5, 320]  // Prefetch B
         FMLA v22.4s, v12.4s,  v3.s[2]
         FMLA v24.4s, v12.4s,  v4.s[0]
         FMLA v26.4s, v12.4s,  v4.s[2]
diff --git a/src/init.c b/src/init.c
index 8f6975c..b9a8ab3 100644
--- a/src/init.c
+++ b/src/init.c
@@ -323,12 +323,12 @@
         case cpuinfo_uarch_cortex_a53:
         case cpuinfo_uarch_cortex_a55:
           xnn_params.f32.gemm = (struct gemm_parameters) {
-            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
-            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
-            .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
-            .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
-            .mr = 4,
-            .nr = 12,
+            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
+            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
+            .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+            .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+            .mr = 6,
+            .nr = 8,
           };
           break;
         case cpuinfo_uarch_cortex_a73: