4x12 A53 kernel uses prefetches on A

GEMM bench
Was
f32_gemm_4x12__aarch64_neonfma_cortex_a53                47765525
Now
f32_gemm_4x12__aarch64_neonfma_cortex_a53                46775324

e2e mobilenet_v2
Was
f32_gemm_4x12__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time     111701 us
Now
f32_gemm_4x12__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time     109856 us
PiperOrigin-RevId: 289131813
diff --git a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
index 3e8691c..2b48cae 100644
--- a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
@@ -103,26 +103,47 @@
           LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
           LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
           LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
+          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
+          PRFM PLDL1KEEP,  [x3, 64]
+          PRFM PLDL1KEEP,  [x11,  0]
+          PRFM PLDL1KEEP,  [x11, 64]
+          PRFM PLDL1KEEP, [x12,  0]
+          PRFM PLDL1KEEP, [x12, 64]
+          PRFM PLDL1KEEP, [x4,  0]
+          PRFM PLDL1KEEP, [x4, 64]
+          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+          PRFM PLDL1KEEP, [x5,  64]
+          PRFM PLDL1KEEP, [x5, 128]
+          PRFM PLDL1KEEP, [x5, 192]
+          PRFM PLDL1KEEP, [x5, 256]
+          PRFM PLDL1KEEP, [x5, 320]
         $else:
           # Load initial bias from w into accumulators
           LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
           MOV v23.16b, v20.16b
+          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
+          PRFM PLDL1KEEP,  [x3, 64]
           MOV v24.16b, v21.16b
+          PRFM PLDL1KEEP,  [x11,  0]
+          PRFM PLDL1KEEP,  [x11, 64]
           MOV v25.16b, v22.16b
+          PRFM PLDL1KEEP, [x12,  0]
+          PRFM PLDL1KEEP, [x12, 64]
           MOV v26.16b, v20.16b
+          PRFM PLDL1KEEP, [x4,  0]
+          PRFM PLDL1KEEP, [x4, 64]
           MOV v27.16b, v21.16b
+          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+          PRFM PLDL1KEEP, [x5,  64]
           MOV v28.16b, v22.16b
+          PRFM PLDL1KEEP, [x5, 128]
+          PRFM PLDL1KEEP, [x5, 192]
           MOV v29.16b, v20.16b
+          PRFM PLDL1KEEP, [x5, 256]
+          PRFM PLDL1KEEP, [x5, 320]
           MOV v30.16b, v21.16b
           MOV v31.16b, v22.16b
 
-        PRFM PLDL1KEEP, [x5]
-        PRFM PLDL1KEEP, [x5, 64]
-        PRFM PLDL1KEEP, [x5, 128]
-        PRFM PLDL1KEEP, [x5, 192]
-        PRFM PLDL1KEEP, [x5, 256]
-        PRFM PLDL1KEEP, [x5, 320]
-
         # Is there at least 4 floats (16 bytes)?
         SUBS x0, x2, 16  // k = kc - 16
         B.LO 3f
@@ -174,8 +195,8 @@
         FMLA v20.4s, v6.4s, v0.s[0]
         LDR x13, [x11], 8      // a1
         FMLA v23.4s, v6.4s, v0.s[2]
-        PRFM PLDL1KEEP, [x5, 192]
         FMLA v26.4s, v6.4s, v1.s[0]
+        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0
 
         # BLOCK 1
         LDR d3, [x12], 8       // a2
@@ -183,8 +204,8 @@
         FMLA v29.4s, v6.4s, v1.s[2]
         LDR x8, [x4], 8        // a3
         FMLA v21.4s, v7.4s, v0.s[0]
-        PRFM PLDL1KEEP, [x5, 256]
         FMLA v24.4s, v7.4s, v0.s[2]
+        PRFM PLDL1KEEP, [x11, 128]      // Prefetch A1
 
         # BLOCK 2
         LDR d14, [x5]          // vb0x0123
@@ -192,8 +213,8 @@
         FMLA v27.4s, v7.4s, v1.s[0]
         LDR x20, [x5, 8]
         FMLA v30.4s, v7.4s, v1.s[2]
-        PRFM PLDL1KEEP, [x5, 320]
         FMLA v22.4s, v8.4s, v0.s[0]
+        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A2
 
         # BLOCK 3
         LDR d15, [x5, 16]      // vb0x4567
@@ -202,6 +223,7 @@
         LDR x21, [x5, 24]
         FMLA v28.4s, v8.4s, v1.s[0]
         FMLA v31.4s, v8.4s, v1.s[2]
+        PRFM PLDL1KEEP, [x4, 128]     // Prefetch A3
 
         # BLOCK 4
         LDR d16, [x5, 32]      // vb0x89AB
@@ -210,6 +232,7 @@
         LDR x16, [x5, 40]
         FMLA v23.4s, v9.4s, v0.s[3]
         FMLA v26.4s, v9.4s, v1.s[1]
+        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
 
         # BLOCK 5
         LDR d17, [x5, 48]      // vb1x0123
@@ -218,6 +241,7 @@
         LDR x17, [x5, 56]
         FMLA v21.4s, v10.4s, v0.s[1]
         FMLA v24.4s, v10.4s, v0.s[3]
+        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
 
         # BLOCK 6
         LDR d18, [x5, 64]       // vb1x4567
diff --git a/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
index e53e5a2..e32eccc 100644
--- a/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
@@ -99,9 +99,16 @@
         LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
         LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
         LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
-
-        PRFM PLDL1KEEP, [x5]
-        PRFM PLDL1KEEP, [x5, 64]
+        PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
+        PRFM PLDL1KEEP,  [x3, 64]
+        PRFM PLDL1KEEP,  [x11,  0]
+        PRFM PLDL1KEEP,  [x11, 64]
+        PRFM PLDL1KEEP, [x12,  0]
+        PRFM PLDL1KEEP, [x12, 64]
+        PRFM PLDL1KEEP, [x4,  0]
+        PRFM PLDL1KEEP, [x4, 64]
+        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+        PRFM PLDL1KEEP, [x5,  64]
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
         PRFM PLDL1KEEP, [x5, 256]
@@ -158,8 +165,8 @@
         FMLA v20.4s, v6.4s, v0.s[0]
         LDR x13, [x11], 8      // a1
         FMLA v23.4s, v6.4s, v0.s[2]
-        PRFM PLDL1KEEP, [x5, 192]
         FMLA v26.4s, v6.4s, v1.s[0]
+        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0
 
         # BLOCK 1
         LDR d3, [x12], 8       // a2
@@ -167,8 +174,8 @@
         FMLA v29.4s, v6.4s, v1.s[2]
         LDR x8, [x4], 8        // a3
         FMLA v21.4s, v7.4s, v0.s[0]
-        PRFM PLDL1KEEP, [x5, 256]
         FMLA v24.4s, v7.4s, v0.s[2]
+        PRFM PLDL1KEEP, [x11, 128]      // Prefetch A1
 
         # BLOCK 2
         LDR d14, [x5]          // vb0x0123
@@ -176,8 +183,8 @@
         FMLA v27.4s, v7.4s, v1.s[0]
         LDR x20, [x5, 8]
         FMLA v30.4s, v7.4s, v1.s[2]
-        PRFM PLDL1KEEP, [x5, 320]
         FMLA v22.4s, v8.4s, v0.s[0]
+        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A2
 
         # BLOCK 3
         LDR d15, [x5, 16]      // vb0x4567
@@ -186,6 +193,7 @@
         LDR x21, [x5, 24]
         FMLA v28.4s, v8.4s, v1.s[0]
         FMLA v31.4s, v8.4s, v1.s[2]
+        PRFM PLDL1KEEP, [x4, 128]     // Prefetch A3
 
         # BLOCK 4
         LDR d16, [x5, 32]      // vb0x89AB
@@ -194,6 +202,7 @@
         LDR x16, [x5, 40]
         FMLA v23.4s, v9.4s, v0.s[3]
         FMLA v26.4s, v9.4s, v1.s[1]
+        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
 
         # BLOCK 5
         LDR d17, [x5, 48]      // vb1x0123
@@ -202,6 +211,7 @@
         LDR x17, [x5, 56]
         FMLA v21.4s, v10.4s, v0.s[1]
         FMLA v24.4s, v10.4s, v0.s[3]
+        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
 
         # BLOCK 6
         LDR d18, [x5, 64]       // vb1x4567
diff --git a/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
index 30bdcb7..78686e7 100644
--- a/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
@@ -94,21 +94,28 @@
         # Load initial bias from w into accumulators
         LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
         MOV v23.16b, v20.16b
+        PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
+        PRFM PLDL1KEEP,  [x3, 64]
         MOV v24.16b, v21.16b
+        PRFM PLDL1KEEP,  [x11,  0]
+        PRFM PLDL1KEEP,  [x11, 64]
         MOV v25.16b, v22.16b
+        PRFM PLDL1KEEP, [x12,  0]
+        PRFM PLDL1KEEP, [x12, 64]
         MOV v26.16b, v20.16b
+        PRFM PLDL1KEEP, [x4,  0]
+        PRFM PLDL1KEEP, [x4, 64]
         MOV v27.16b, v21.16b
+        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+        PRFM PLDL1KEEP, [x5,  64]
         MOV v28.16b, v22.16b
-        MOV v29.16b, v20.16b
-        MOV v30.16b, v21.16b
-        MOV v31.16b, v22.16b
-
-        PRFM PLDL1KEEP, [x5]
-        PRFM PLDL1KEEP, [x5, 64]
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
+        MOV v29.16b, v20.16b
         PRFM PLDL1KEEP, [x5, 256]
         PRFM PLDL1KEEP, [x5, 320]
+        MOV v30.16b, v21.16b
+        MOV v31.16b, v22.16b
 
         # Is there at least 4 floats (16 bytes)?
         SUBS x0, x2, 16  // k = kc - 16
@@ -161,8 +168,8 @@
         FMLA v20.4s, v6.4s, v0.s[0]
         LDR x13, [x11], 8      // a1
         FMLA v23.4s, v6.4s, v0.s[2]
-        PRFM PLDL1KEEP, [x5, 192]
         FMLA v26.4s, v6.4s, v1.s[0]
+        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0
 
         # BLOCK 1
         LDR d3, [x12], 8       // a2
@@ -170,8 +177,8 @@
         FMLA v29.4s, v6.4s, v1.s[2]
         LDR x8, [x4], 8        // a3
         FMLA v21.4s, v7.4s, v0.s[0]
-        PRFM PLDL1KEEP, [x5, 256]
         FMLA v24.4s, v7.4s, v0.s[2]
+        PRFM PLDL1KEEP, [x11, 128]      // Prefetch A1
 
         # BLOCK 2
         LDR d14, [x5]          // vb0x0123
@@ -179,8 +186,8 @@
         FMLA v27.4s, v7.4s, v1.s[0]
         LDR x20, [x5, 8]
         FMLA v30.4s, v7.4s, v1.s[2]
-        PRFM PLDL1KEEP, [x5, 320]
         FMLA v22.4s, v8.4s, v0.s[0]
+        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A2
 
         # BLOCK 3
         LDR d15, [x5, 16]      // vb0x4567
@@ -189,6 +196,7 @@
         LDR x21, [x5, 24]
         FMLA v28.4s, v8.4s, v1.s[0]
         FMLA v31.4s, v8.4s, v1.s[2]
+        PRFM PLDL1KEEP, [x4, 128]     // Prefetch A3
 
         # BLOCK 4
         LDR d16, [x5, 32]      // vb0x89AB
@@ -197,6 +205,7 @@
         LDR x16, [x5, 40]
         FMLA v23.4s, v9.4s, v0.s[3]
         FMLA v26.4s, v9.4s, v1.s[1]
+        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
 
         # BLOCK 5
         LDR d17, [x5, 48]      // vb1x0123
@@ -205,6 +214,7 @@
         LDR x17, [x5, 56]
         FMLA v21.4s, v10.4s, v0.s[1]
         FMLA v24.4s, v10.4s, v0.s[3]
+        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
 
         # BLOCK 6
         LDR d18, [x5, 64]       // vb1x4567