4x8 GEMM and IGEMM microkernels for AArch32 Cortex-A55.  11.5% faster end-to-end:
Was f32_gemm_4x8__aarch32_neon_cortex_a53/mobilenet_v2/real_time         154006 us
Now f32_gemm_4x8__aarch32_neon_cortex_a55/mobilenet_v2/real_time         138030 us

23% faster GEMM on mobilenet_v2
Was f32_gemm_4x8__aarch32_neon_cortex_a53                    59909460
Now f32_gemm_4x8__aarch32_neon_cortex_a55                    48681160

19.2% faster IGEMM on mobilenet_v2
Was f32_igemm_4x8__aarch32_neon_cortex_a53                   67209225
Now f32_igemm_4x8__aarch32_neon_cortex_a55                   56380323

End2End benchmark:
Was
MobileNetV1/T:1/real_time          236793 us
MobileNetV2/T:1/real_time          154689 us
MobileNetV3Large/T:1/real_time     130964 us
MobileNetV3Small/T:1/real_time      42383 us

Now
MobileNetV1/T:1/real_time          199053 us
MobileNetV2/T:1/real_time          140262 us
MobileNetV3Large/T:1/real_time     120468 us
MobileNetV3Small/T:1/real_time      39952 us

Revision 1 of the Cortex-A55 can co-issue a 64-bit vector
load with each FMA, so the Cortex-A53 microkernel is
re-arranged to pair 3 FMAs with 2 loads, keeping at most
one load per FMA so the loads issue alongside the
arithmetic instead of taking their own slots.

The basic code block is 3 VMLAs interleaved with 2 64-bit VLD1 loads:

// BLOCK 0
VMLA.F32     q8, q4, d4[0]   // accumulate row 0
VLD1.32    {d0}, [r3]!       // A0
VMLA.F32    q10, q4, d5[0]   // accumulate row 1
VLD1.32    {d1}, [r12]!      // A1
VMLA.F32    q12, q4, d6[0]   // accumulate row 2
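
For reference, a rough NEON-intrinsics sketch of the same block. The
function and variable names below are invented for illustration and do
not appear in the kernel, which is written in assembly so that this
exact VMLA/VLD1 interleaving is preserved.

#include <arm_neon.h>

// Illustrative only: 3 lane-broadcast FMAs interleaved with 2 64-bit A loads.
static inline void block0_sketch(
    const float** a0, const float** a1,                   // A row pointers (r3, r12)
    float32x4_t vb,                                        // q4: 4 B values for this k
    float32x2_t va0, float32x2_t va1, float32x2_t va2,     // d4, d5, d6: A lanes in use
    float32x4_t* vacc0, float32x4_t* vacc1, float32x4_t* vacc2,  // q8, q10, q12
    float32x2_t* va0_next, float32x2_t* va1_next)          // d0, d1: A data for the next step
{
  *vacc0 = vmlaq_lane_f32(*vacc0, vb, va0, 0);  // VMLA.F32  q8, q4, d4[0]
  *va0_next = vld1_f32(*a0); *a0 += 2;          // VLD1.32  {d0}, [r3]!   A0
  *vacc1 = vmlaq_lane_f32(*vacc1, vb, va1, 0);  // VMLA.F32 q10, q4, d5[0]
  *va1_next = vld1_f32(*a1); *a1 += 2;          // VLD1.32  {d1}, [r12]!  A1
  *vacc2 = vmlaq_lane_f32(*vacc2, vb, va2, 0);  // VMLA.F32 q12, q4, d6[0]
}

On Cortex-A55 r1 the two loads are expected to dual-issue with the
surrounding FMAs, so the block should cost roughly the 3
multiply-accumulates.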

PiperOrigin-RevId: 300384515
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 10bcbee..1aa6fcd 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -262,8 +262,24 @@
   static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1);
   }
+  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1);
+  }
+  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1);
+  }
 
   BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_pld_cortex_a75)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
 #endif  /* XNN_ARCH_ARM */
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
@@ -379,22 +395,6 @@
   BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
 #endif  /* XNN_ARCH_ARM64 */
 
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-  static void f32_igemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1);
-  }
-  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1);
-  }
-  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1);
-  }
-
-  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_pld_cortex_a75)
-  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
-  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
-#endif  /* XNN_ARCH_ARM */
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);