4x8 GEMM and IGEMM microkernels for AArch32 Cortex-A55.  11.5% faster end to end on MobileNet v2:
Was f32_gemm_4x8__aarch32_neon_cortex_a53/mobilenet_v2/real_time         154006 us
Now f32_gemm_4x8__aarch32_neon_cortex_a55/mobilenet_v2/real_time         138030 us

23% faster GEMM on mobilenet_v2
Was f32_gemm_4x8__aarch32_neon_cortex_a53                    59909460
Now f32_gemm_4x8__aarch32_neon_cortex_a55                    48681160

19.2% faster IGEMM on mobilenet_v2
Was f32_igemm_4x8__aarch32_neon_cortex_a53                   67209225
Now f32_igemm_4x8__aarch32_neon_cortex_a55                   56380323

End-to-end benchmarks:
Was
MobileNetV1/T:1/real_time          236793 us
MobileNetV2/T:1/real_time          154689 us
MobileNetV3Large/T:1/real_time     130964 us
MobileNetV3Small/T:1/real_time      42383 us

Now
MobileNetV1/T:1/real_time          199053 us
MobileNetV2/T:1/real_time          140262 us
MobileNetV3Large/T:1/real_time     120468 us
MobileNetV3Small/T:1/real_time      39952 us

The rev 1 version of the Cortex-A55 can co-issue a 64-bit
vector load with each FMA, so the Cortex-A53 microkernel is
rearranged to pair 3 FMAs with 2 loads.

The basic code block is 3 VMLA interleaved with 2 64-bit VLD1 loads:

// BLOCK 0
VMLA.F32     q8, q4, d4[0]
VLD1.32    {d0}, [r3]!       // A0
VMLA.F32    q10, q4, d5[0]
VLD1.32    {d1}, [r12]!      // A1
VMLA.F32    q12, q4, d6[0]
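
For reference, the arithmetic that one k-step of the 4x8 microkernel performs
can be sketched with NEON intrinsics. This is a minimal illustration only: the
function and variable names below are assumptions for readability and are not
part of the kernel, which keeps the A values in lanes of d0-d7, the B values in
q4-q7, and the accumulators in q8-q15, interleaving the loads as shown above.

#include <arm_neon.h>

// Reference math for one k-step of a 4x8 GEMM microkernel: 4 A values
// (one per row) are broadcast against 8 packed B values (one per column)
// and accumulated into the 4x8 output tile.
static void f32_gemm_4x8_kstep(float32x4_t acc[4][2],   // 4 rows x 8 columns
                               const float a[4],        // one column of A
                               const float* restrict w) // 8 packed B values
{
  const float32x4_t b0 = vld1q_f32(w);      // columns 0-3 (q4 above)
  const float32x4_t b1 = vld1q_f32(w + 4);  // columns 4-7
  for (int m = 0; m < 4; m++) {
    acc[m][0] = vmlaq_n_f32(acc[m][0], b0, a[m]);  // e.g. VMLA.F32 q8, q4, d4[0]
    acc[m][1] = vmlaq_n_f32(acc[m][1], b1, a[m]);
  }
}

The arithmetic is identical between the A53 and A55 kernels; what the
hand-written assembly changes is only the scheduling, i.e. which load is
placed next to which FMA.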

PiperOrigin-RevId: 300384515
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index 2523fda..190d55c 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -283,6 +283,15 @@
       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
       benchmark::utils::CheckNEON);
   }
+  static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
+      xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55,
+      xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
   static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
@@ -304,6 +313,7 @@
 
   BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_ld64);
   BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
+  BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a55);
   BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
   BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_pld_cortex_a75);
 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 8788f23..c68578f 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -457,6 +457,9 @@
   static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1, benchmark::utils::CheckNEON);
   }
+  static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
   static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
   }
@@ -466,6 +469,7 @@
 
   BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
   BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
+  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
   BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
   BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_pld_cortex_a75)
 #endif  // XNN_ARCH_ARM
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 10bcbee..1aa6fcd 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -262,8 +262,24 @@
   static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1);
   }
+  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1);
+  }
 
   BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_pld_cortex_a75)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
 #endif  /* XNN_ARCH_ARM */
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
@@ -379,22 +395,6 @@
   BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
 #endif  /* XNN_ARCH_ARM64 */
 
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-  static void f32_igemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1);
-  }
-  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1);
-  }
-  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1);
-  }
-
-  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_pld_cortex_a75)
-  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
-  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
-#endif  /* XNN_ARCH_ARM */
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);