6x8 GEMM and IGEMM microkernels for Cortex A55. 9% faster end to end:
Was f32_gemm_6x8__aarch64_neonfma_cortex_a53/mobilenet_v2/real_time 121711 us
Now f32_gemm_6x8__aarch64_neonfma_cortex_a55/mobilenet_v2/real_time 111573 us
18% faster GEMM on mobilenet_v2
Was f32_gemm_6x8__aarch64_neonfma_cortex_a53 50324300
Now f32_gemm_6x8__aarch64_neonfma_cortex_a55 42632534
15% faster IGEMM on mobilenet_v2
Was f32_igemm_6x8__aarch64_neonfma_cortex_a53 55920696
Now f32_igemm_6x8__aarch64_neonfma_cortex_a55 48353584
The rev 1 version of the Cortex A55 can co-issue a 64-bit
vector load with each FMA, so the Cortex-A53 microkernel
is rearranged so that every 3 FMAs are paired with 2 loads and an INS.
PiperOrigin-RevId: 299988395
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 0e8fd5f..10bcbee 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -311,6 +311,10 @@
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1);
}
+ static void f32_igemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1);
+ }
+
static void f32_igemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1);
}
@@ -362,6 +366,7 @@
BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a57)
BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
+ BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a55)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a57)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)