4x12 A53 kernel: use prefetches on A
GEMM bench
Was
f32_gemm_4x12__aarch64_neonfma_cortex_a53 47765525
Now
f32_gemm_4x12__aarch64_neonfma_cortex_a53 46775324
e2e mobilenet_v2
Was
f32_gemm_4x12__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time 111701 us
Now
f32_gemm_4x12__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time 109856 us
PiperOrigin-RevId: 289131813
diff --git a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in b/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
index 3e8691c..2b48cae 100644
--- a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
+++ b/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
@@ -103,26 +103,47 @@
LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
+ PRFM PLDL1KEEP, [x3, 0] // Prefetch A
+ PRFM PLDL1KEEP, [x3, 64]
+ PRFM PLDL1KEEP, [x11, 0]
+ PRFM PLDL1KEEP, [x11, 64]
+ PRFM PLDL1KEEP, [x12, 0]
+ PRFM PLDL1KEEP, [x12, 64]
+ PRFM PLDL1KEEP, [x4, 0]
+ PRFM PLDL1KEEP, [x4, 64]
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ PRFM PLDL1KEEP, [x5, 64]
+ PRFM PLDL1KEEP, [x5, 128]
+ PRFM PLDL1KEEP, [x5, 192]
+ PRFM PLDL1KEEP, [x5, 256]
+ PRFM PLDL1KEEP, [x5, 320]
$else:
# Load initial bias from w into accumulators
LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
MOV v23.16b, v20.16b
+ PRFM PLDL1KEEP, [x3, 0] // Prefetch A
+ PRFM PLDL1KEEP, [x3, 64]
MOV v24.16b, v21.16b
+ PRFM PLDL1KEEP, [x11, 0]
+ PRFM PLDL1KEEP, [x11, 64]
MOV v25.16b, v22.16b
+ PRFM PLDL1KEEP, [x12, 0]
+ PRFM PLDL1KEEP, [x12, 64]
MOV v26.16b, v20.16b
+ PRFM PLDL1KEEP, [x4, 0]
+ PRFM PLDL1KEEP, [x4, 64]
MOV v27.16b, v21.16b
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ PRFM PLDL1KEEP, [x5, 64]
MOV v28.16b, v22.16b
+ PRFM PLDL1KEEP, [x5, 128]
+ PRFM PLDL1KEEP, [x5, 192]
MOV v29.16b, v20.16b
+ PRFM PLDL1KEEP, [x5, 256]
+ PRFM PLDL1KEEP, [x5, 320]
MOV v30.16b, v21.16b
MOV v31.16b, v22.16b
- PRFM PLDL1KEEP, [x5]
- PRFM PLDL1KEEP, [x5, 64]
- PRFM PLDL1KEEP, [x5, 128]
- PRFM PLDL1KEEP, [x5, 192]
- PRFM PLDL1KEEP, [x5, 256]
- PRFM PLDL1KEEP, [x5, 320]
-
# Is there at least 4 floats (16 bytes)?
SUBS x0, x2, 16 // k = kc - 16
B.LO 3f
@@ -174,8 +195,8 @@
FMLA v20.4s, v6.4s, v0.s[0]
LDR x13, [x11], 8 // a1
FMLA v23.4s, v6.4s, v0.s[2]
- PRFM PLDL1KEEP, [x5, 192]
FMLA v26.4s, v6.4s, v1.s[0]
+ PRFM PLDL1KEEP, [x3, 128] // Prefetch A0
# BLOCK 1
LDR d3, [x12], 8 // a2
@@ -183,8 +204,8 @@
FMLA v29.4s, v6.4s, v1.s[2]
LDR x8, [x4], 8 // a3
FMLA v21.4s, v7.4s, v0.s[0]
- PRFM PLDL1KEEP, [x5, 256]
FMLA v24.4s, v7.4s, v0.s[2]
+ PRFM PLDL1KEEP, [x11, 128] // Prefetch A1
# BLOCK 2
LDR d14, [x5] // vb0x0123
@@ -192,8 +213,8 @@
FMLA v27.4s, v7.4s, v1.s[0]
LDR x20, [x5, 8]
FMLA v30.4s, v7.4s, v1.s[2]
- PRFM PLDL1KEEP, [x5, 320]
FMLA v22.4s, v8.4s, v0.s[0]
+ PRFM PLDL1KEEP, [x12, 128] // Prefetch A2
# BLOCK 3
LDR d15, [x5, 16] // vb0x4567
@@ -202,6 +223,7 @@
LDR x21, [x5, 24]
FMLA v28.4s, v8.4s, v1.s[0]
FMLA v31.4s, v8.4s, v1.s[2]
+ PRFM PLDL1KEEP, [x4, 128] // Prefetch A3
# BLOCK 4
LDR d16, [x5, 32] // vb0x89AB
@@ -210,6 +232,7 @@
LDR x16, [x5, 40]
FMLA v23.4s, v9.4s, v0.s[3]
FMLA v26.4s, v9.4s, v1.s[1]
+ PRFM PLDL1KEEP, [x5, 384] // Prefetch B
# BLOCK 5
LDR d17, [x5, 48] // vb1x0123
@@ -218,6 +241,7 @@
LDR x17, [x5, 56]
FMLA v21.4s, v10.4s, v0.s[1]
FMLA v24.4s, v10.4s, v0.s[3]
+ PRFM PLDL1KEEP, [x5, 448] // Prefetch B
# BLOCK 6
LDR d18, [x5, 64] // vb1x4567
diff --git a/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
index e53e5a2..e32eccc 100644
--- a/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
@@ -99,9 +99,16 @@
LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
-
- PRFM PLDL1KEEP, [x5]
- PRFM PLDL1KEEP, [x5, 64]
+ PRFM PLDL1KEEP, [x3, 0] // Prefetch A
+ PRFM PLDL1KEEP, [x3, 64]
+ PRFM PLDL1KEEP, [x11, 0]
+ PRFM PLDL1KEEP, [x11, 64]
+ PRFM PLDL1KEEP, [x12, 0]
+ PRFM PLDL1KEEP, [x12, 64]
+ PRFM PLDL1KEEP, [x4, 0]
+ PRFM PLDL1KEEP, [x4, 64]
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ PRFM PLDL1KEEP, [x5, 64]
PRFM PLDL1KEEP, [x5, 128]
PRFM PLDL1KEEP, [x5, 192]
PRFM PLDL1KEEP, [x5, 256]
@@ -158,8 +165,8 @@
FMLA v20.4s, v6.4s, v0.s[0]
LDR x13, [x11], 8 // a1
FMLA v23.4s, v6.4s, v0.s[2]
- PRFM PLDL1KEEP, [x5, 192]
FMLA v26.4s, v6.4s, v1.s[0]
+ PRFM PLDL1KEEP, [x3, 128] // Prefetch A0
# BLOCK 1
LDR d3, [x12], 8 // a2
@@ -167,8 +174,8 @@
FMLA v29.4s, v6.4s, v1.s[2]
LDR x8, [x4], 8 // a3
FMLA v21.4s, v7.4s, v0.s[0]
- PRFM PLDL1KEEP, [x5, 256]
FMLA v24.4s, v7.4s, v0.s[2]
+ PRFM PLDL1KEEP, [x11, 128] // Prefetch A1
# BLOCK 2
LDR d14, [x5] // vb0x0123
@@ -176,8 +183,8 @@
FMLA v27.4s, v7.4s, v1.s[0]
LDR x20, [x5, 8]
FMLA v30.4s, v7.4s, v1.s[2]
- PRFM PLDL1KEEP, [x5, 320]
FMLA v22.4s, v8.4s, v0.s[0]
+ PRFM PLDL1KEEP, [x12, 128] // Prefetch A2
# BLOCK 3
LDR d15, [x5, 16] // vb0x4567
@@ -186,6 +193,7 @@
LDR x21, [x5, 24]
FMLA v28.4s, v8.4s, v1.s[0]
FMLA v31.4s, v8.4s, v1.s[2]
+ PRFM PLDL1KEEP, [x4, 128] // Prefetch A3
# BLOCK 4
LDR d16, [x5, 32] // vb0x89AB
@@ -194,6 +202,7 @@
LDR x16, [x5, 40]
FMLA v23.4s, v9.4s, v0.s[3]
FMLA v26.4s, v9.4s, v1.s[1]
+ PRFM PLDL1KEEP, [x5, 384] // Prefetch B
# BLOCK 5
LDR d17, [x5, 48] // vb1x0123
@@ -202,6 +211,7 @@
LDR x17, [x5, 56]
FMLA v21.4s, v10.4s, v0.s[1]
FMLA v24.4s, v10.4s, v0.s[3]
+ PRFM PLDL1KEEP, [x5, 448] // Prefetch B
# BLOCK 6
LDR d18, [x5, 64] // vb1x4567
diff --git a/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
index 30bdcb7..78686e7 100644
--- a/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
+++ b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
@@ -94,21 +94,28 @@
# Load initial bias from w into accumulators
LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
MOV v23.16b, v20.16b
+ PRFM PLDL1KEEP, [x3, 0] // Prefetch A
+ PRFM PLDL1KEEP, [x3, 64]
MOV v24.16b, v21.16b
+ PRFM PLDL1KEEP, [x11, 0]
+ PRFM PLDL1KEEP, [x11, 64]
MOV v25.16b, v22.16b
+ PRFM PLDL1KEEP, [x12, 0]
+ PRFM PLDL1KEEP, [x12, 64]
MOV v26.16b, v20.16b
+ PRFM PLDL1KEEP, [x4, 0]
+ PRFM PLDL1KEEP, [x4, 64]
MOV v27.16b, v21.16b
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ PRFM PLDL1KEEP, [x5, 64]
MOV v28.16b, v22.16b
- MOV v29.16b, v20.16b
- MOV v30.16b, v21.16b
- MOV v31.16b, v22.16b
-
- PRFM PLDL1KEEP, [x5]
- PRFM PLDL1KEEP, [x5, 64]
PRFM PLDL1KEEP, [x5, 128]
PRFM PLDL1KEEP, [x5, 192]
+ MOV v29.16b, v20.16b
PRFM PLDL1KEEP, [x5, 256]
PRFM PLDL1KEEP, [x5, 320]
+ MOV v30.16b, v21.16b
+ MOV v31.16b, v22.16b
# Is there at least 4 floats (16 bytes)?
SUBS x0, x2, 16 // k = kc - 16
@@ -161,8 +168,8 @@
FMLA v20.4s, v6.4s, v0.s[0]
LDR x13, [x11], 8 // a1
FMLA v23.4s, v6.4s, v0.s[2]
- PRFM PLDL1KEEP, [x5, 192]
FMLA v26.4s, v6.4s, v1.s[0]
+ PRFM PLDL1KEEP, [x3, 128] // Prefetch A0
# BLOCK 1
LDR d3, [x12], 8 // a2
@@ -170,8 +177,8 @@
FMLA v29.4s, v6.4s, v1.s[2]
LDR x8, [x4], 8 // a3
FMLA v21.4s, v7.4s, v0.s[0]
- PRFM PLDL1KEEP, [x5, 256]
FMLA v24.4s, v7.4s, v0.s[2]
+ PRFM PLDL1KEEP, [x11, 128] // Prefetch A1
# BLOCK 2
LDR d14, [x5] // vb0x0123
@@ -179,8 +186,8 @@
FMLA v27.4s, v7.4s, v1.s[0]
LDR x20, [x5, 8]
FMLA v30.4s, v7.4s, v1.s[2]
- PRFM PLDL1KEEP, [x5, 320]
FMLA v22.4s, v8.4s, v0.s[0]
+ PRFM PLDL1KEEP, [x12, 128] // Prefetch A2
# BLOCK 3
LDR d15, [x5, 16] // vb0x4567
@@ -189,6 +196,7 @@
LDR x21, [x5, 24]
FMLA v28.4s, v8.4s, v1.s[0]
FMLA v31.4s, v8.4s, v1.s[2]
+ PRFM PLDL1KEEP, [x4, 128] // Prefetch A3
# BLOCK 4
LDR d16, [x5, 32] // vb0x89AB
@@ -197,6 +205,7 @@
LDR x16, [x5, 40]
FMLA v23.4s, v9.4s, v0.s[3]
FMLA v26.4s, v9.4s, v1.s[1]
+ PRFM PLDL1KEEP, [x5, 384] // Prefetch B
# BLOCK 5
LDR d17, [x5, 48] // vb1x0123
@@ -205,6 +214,7 @@
LDR x17, [x5, 56]
FMLA v21.4s, v10.4s, v0.s[1]
FMLA v24.4s, v10.4s, v0.s[3]
+ PRFM PLDL1KEEP, [x5, 448] // Prefetch B
# BLOCK 6
LDR d18, [x5, 64] // vb1x4567