6x8 A53 GEMM: use prefetch.
20.88% faster than previous 6x8. 10.6% faster than 4x12.
Switch CPU detection for Cortex-A53 (and Cortex-A55, which shares the same case) to select the 6x8 kernels by default.
PiperOrigin-RevId: 277602956
diff --git a/src/init.c b/src/init.c
index 8f6975c..b9a8ab3 100644
--- a/src/init.c
+++ b/src/init.c
@@ -323,12 +323,12 @@
case cpuinfo_uarch_cortex_a53:
case cpuinfo_uarch_cortex_a55:
xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53,
- .mr = 4,
- .nr = 12,
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+ .mr = 6,
+ .nr = 8,
};
break;
case cpuinfo_uarch_cortex_a73: