Adjust A53 GEMM / IGEMM kernel prefetch offsets by 1 cache line

The x5 register used for weights advances by 1 cache line: 64 bytes in most kernels, and 96 bytes in 4x12.
But the prefetch offsets don't account for this and skip a cache line, so adjust the offsets by 1 cache line.
4x12 consumes 192 bytes (3 cache lines), so do 3 prefetches.

End-to-end benchmark results before this change:
f32_gemm_4x12__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time     109863 us
4x12 is submitted
f32_gemm_6x8__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time       96928 us
f32_gemm_4x8__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time      106907 us

End-to-end benchmark results after this change:
f32_gemm_6x8__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time       95999 us
f32_gemm_4x12__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time     102843 us
f32_gemm_4x8__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time      104823 us

PiperOrigin-RevId: 289984651
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
index b77c351..37860d0 100644
--- a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
@@ -118,8 +118,6 @@
           PRFM PLDL1KEEP, [x5,  64]
           PRFM PLDL1KEEP, [x5, 128]
           PRFM PLDL1KEEP, [x5, 192]
-          PRFM PLDL1KEEP, [x5, 256]
-          PRFM PLDL1KEEP, [x5, 320]
         $else:
           # Load initial bias from w into accumulators
           LDP q20, q21, [x5], 32
@@ -136,13 +134,11 @@
           PRFM PLDL1KEEP, [x11,  0]
           PRFM PLDL1KEEP, [x11, 64]
           MOV v26.16b, v20.16b
-          MOV v27.16b, v21.16b
           PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+          MOV v27.16b, v21.16b
           PRFM PLDL1KEEP, [x5,  64]
           PRFM PLDL1KEEP, [x5, 128]
           PRFM PLDL1KEEP, [x5, 192]
-          PRFM PLDL1KEEP, [x5, 256]
-          PRFM PLDL1KEEP, [x5, 320]
 
         # Is there at least 4 floats (16 bytes) for prologue + epilogue?
         SUBS x0, x2, 16  // k = kc - 16
@@ -261,7 +257,7 @@
         LDR  x4, [x5, 104]
         FMLA v23.4s, v15.4s,  v3.s[3]
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
 
         // BLOCK 5
         // NOTE that block needs to be 4 cycles for LDR not to stall
@@ -270,7 +266,7 @@
         FMLA v27.4s, v15.4s,  v4.s[3]
         LDR  x4, [x5, 120]
         SUBS x0, x0, 16
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
         ADD x5, x5, 128
         B.HS 1b
 
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
index cc6f1d6..edff571 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
@@ -136,8 +136,6 @@
           PRFM PLDL1KEEP, [x5,  64]
           PRFM PLDL1KEEP, [x5, 128]
           PRFM PLDL1KEEP, [x5, 192]
-          PRFM PLDL1KEEP, [x5, 256]
-          PRFM PLDL1KEEP, [x5, 320]
         $else:
           # Load initial bias from w into accumulators
           LDP q20, q21, [x5], 32
@@ -161,14 +159,12 @@
           PRFM PLDL1KEEP,  [x4, 64]
           MOV v28.16b, v20.16b
           PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
-          PRFM PLDL1KEEP, [x5,  64]
           MOV v29.16b, v21.16b
-          PRFM PLDL1KEEP, [x5, 128]
-          PRFM PLDL1KEEP, [x5, 192]
+          PRFM PLDL1KEEP, [x5,  64]
           MOV v30.16b, v20.16b
-          PRFM PLDL1KEEP, [x5, 256]
-          PRFM PLDL1KEEP, [x5, 320]
+          PRFM PLDL1KEEP, [x5, 128]
           MOV v31.16b, v21.16b
+          PRFM PLDL1KEEP, [x5, 192]
 
         # Is there at least 4 floats (16 bytes) for prologue + epilogue?
         SUBS x0, x2, 16  // k = kc - 16
@@ -316,9 +312,9 @@
         FMLA v21.4s, v15.4s,  v3.s[1]
         LDR  x19, [x5, 120]
         FMLA v23.4s, v15.4s,  v3.s[3]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
 
         // BLOCK 7
         SUBS x0, x0, 16  // LDR lands here
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
index c857ff8..9b447aa 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
@@ -114,8 +114,6 @@
         PRFM PLDL1KEEP, [x5,  64]
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
-        PRFM PLDL1KEEP, [x5, 256]
-        PRFM PLDL1KEEP, [x5, 320]
 
         # Is there at least 4 floats (16 bytes) for prologue + epilogue?
         SUBS x0, x2, 16  // k = kc - 16
@@ -234,7 +232,7 @@
         LDR  x4, [x5, 104]
         FMLA v23.4s, v15.4s,  v3.s[3]
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
 
         // BLOCK 5
         // NOTE that block needs to be 4 cycles for LDR not to stall
@@ -243,7 +241,7 @@
         FMLA v27.4s, v15.4s,  v4.s[3]
         LDR  x4, [x5, 120]
         SUBS x0, x0, 16
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
         ADD x5, x5, 128
         B.HS 1b
 
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
index eb32024..163e642 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
@@ -132,8 +132,6 @@
         PRFM PLDL1KEEP, [x5,  64]
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
-        PRFM PLDL1KEEP, [x5, 256]
-        PRFM PLDL1KEEP, [x5, 320]
 
         # Is there at least 4 floats (16 bytes) for prologue + epilogue?
         SUBS x0, x2, 16  // k = kc - 16
@@ -281,9 +279,9 @@
         FMLA v21.4s, v15.4s,  v3.s[1]
         LDR  x19, [x5, 120]
         FMLA v23.4s, v15.4s,  v3.s[3]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
 
         // BLOCK 7
         SUBS x0, x0, 16  // LDR lands here
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
index 212d85a..a0674a2 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
@@ -111,13 +111,11 @@
         PRFM PLDL1KEEP, [x11,  0]
         PRFM PLDL1KEEP, [x11, 64]
         MOV v26.16b, v20.16b
-        MOV v27.16b, v21.16b
         PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
+        MOV v27.16b, v21.16b
         PRFM PLDL1KEEP, [x5,  64]
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
-        PRFM PLDL1KEEP, [x5, 256]
-        PRFM PLDL1KEEP, [x5, 320]
 
         # Is there at least 4 floats (16 bytes) for prologue + epilogue?
         SUBS x0, x2, 16  // k = kc - 16
@@ -236,7 +234,7 @@
         LDR  x4, [x5, 104]
         FMLA v23.4s, v15.4s,  v3.s[3]
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
 
         // BLOCK 5
         // NOTE that block needs to be 4 cycles for LDR not to stall
@@ -245,7 +243,7 @@
         FMLA v27.4s, v15.4s,  v4.s[3]
         LDR  x4, [x5, 120]
         SUBS x0, x0, 16
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
         ADD x5, x5, 128
         B.HS 1b
 
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
index b0fdb18..7689e88 100644
--- a/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
@@ -130,14 +130,12 @@
         PRFM PLDL1KEEP,  [x4, 64]
         MOV v28.16b, v20.16b
         PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
-        PRFM PLDL1KEEP, [x5,  64]
         MOV v29.16b, v21.16b
-        PRFM PLDL1KEEP, [x5, 128]
-        PRFM PLDL1KEEP, [x5, 192]
+        PRFM PLDL1KEEP, [x5,  64]
         MOV v30.16b, v20.16b
-        PRFM PLDL1KEEP, [x5, 256]
-        PRFM PLDL1KEEP, [x5, 320]
+        PRFM PLDL1KEEP, [x5, 128]
         MOV v31.16b, v21.16b
+        PRFM PLDL1KEEP, [x5, 192]
 
         # Is there at least 4 floats (16 bytes) for prologue + epilogue?
         SUBS x0, x2, 16  // k = kc - 16
@@ -285,9 +283,9 @@
         FMLA v21.4s, v15.4s,  v3.s[1]
         LDR  x19, [x5, 120]
         FMLA v23.4s, v15.4s,  v3.s[3]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
 
         // BLOCK 7
         SUBS x0, x0, 16  // LDR lands here
diff --git a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S
index 79e91b6..a6cdb85 100644
--- a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a53.S
@@ -110,8 +110,6 @@
         MOV v27.16b, v21.16b
         PRFM PLDL1KEEP, [x5, 128]
         PRFM PLDL1KEEP, [x5, 192]
-        PRFM PLDL1KEEP, [x5, 256]
-        PRFM PLDL1KEEP, [x5, 320]
 
         MOV x9, x3  // p = ks
 
@@ -251,7 +249,7 @@
         LDR  x19, [x5, 104]
         FMLA v23.4s, v15.4s,  v3.s[3]
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
 
         // BLOCK 5
         // NOTE that block needs to be 4 cycles for LDR not to stall
@@ -260,7 +258,7 @@
         FMLA v27.4s, v15.4s,  v4.s[3]
         LDR  x19, [x5, 120]
         SUBS x0, x0, 16
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
         ADD x5, x5, 128
         B.HS 2b
 
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S
index 4cf5dab..c4ea8a0 100644
--- a/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-igemm/6x8-aarch64-neonfma-cortex-a53.S
@@ -109,6 +109,7 @@
         MOV v25.16b, v21.16b
         PRFM PLDL1KEEP, [x5, 128]
         MOV v26.16b, v20.16b
+        PRFM PLDL1KEEP, [x5, 192]
         MOV v27.16b, v21.16b
         MOV v28.16b, v20.16b
         MOV v29.16b, v21.16b
@@ -288,9 +289,9 @@
         FMLA v21.4s, v15.4s,  v3.s[1]
         LDR  x19, [x5, 120]
         FMLA v23.4s, v15.4s,  v3.s[3]
-        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
         FMLA v25.4s, v15.4s,  v4.s[1]
-        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
+        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
 
         // BLOCK 7
         SUBS x0, x0, 16  // LDR lands here