Neon shuffle GEMM and IGEMM kernels.

With the shuffle kernels, mobilenet_v2 is 7.1% faster on M1 and 6.5% faster on M2.

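The new microkernels use a "shuffle by 4" (s4) scheme: instead of lane-broadcasting one A element per multiply-accumulate, they load four consecutive A elements into a vector, multiply it against B columns packed in a matching rotated layout, and rotate the A vector by one lane with vext between the four steps. The sketch below shows the consume-side inner loop for the 1x8s4 tile; the function name, the assumption that K is a multiple of 4, and the pre-packed weight order are illustrative, not the actual XNNPACK kernel.

  // Minimal sketch of the s4 inner loop for a 1x8 tile, assuming the B panel
  // is pre-packed so each group of 8 weights lines up with the rotated A
  // vector. Names are illustrative, not the XNNPACK implementation.
  #include <arm_neon.h>
  #include <stddef.h>

  // c[0..7] += dot(a[0..k-1], packed B), with k a multiple of 4.
  static void gemm_1x8s4_sketch(size_t k, const float* a, const float* w, float* c) {
    float32x4_t vacc0123 = vld1q_f32(c);
    float32x4_t vacc4567 = vld1q_f32(c + 4);
    for (; k >= 4; k -= 4) {
      float32x4_t va = vld1q_f32(a); a += 4;         // 4 A elements, reused 4 times
      for (int s = 0; s < 4; s++) {
        const float32x4_t vb0123 = vld1q_f32(w);     // weights packed for shuffle step s
        const float32x4_t vb4567 = vld1q_f32(w + 4);
        w += 8;
        vacc0123 = vmlaq_f32(vacc0123, va, vb0123);  // vfmaq_f32 in the neonfma variant
        vacc4567 = vmlaq_f32(vacc4567, va, vb4567);
        va = vextq_f32(va, va, 1);                   // rotate A by one lane
      }
    }
    vst1q_f32(c, vacc0123);
    vst1q_f32(c + 4, vacc4567);
  }

In the benchmark registrations below, the trailing 1, 4 arguments appear to correspond to kr and the shuffle factor sr of each microkernel.
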
PiperOrigin-RevId: 281623279
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index b9095ec..0cde677 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -448,6 +448,38 @@
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
   }
 
+  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4);
+  }
+
+  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4);
+  }
+
+  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4);
+  }
+
+  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4);
+  }
+
+  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4);
+  }
+
+  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4);
+  }
+
+  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4);
+  }
+
+  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
+  }
+
   static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
     PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
   }
@@ -464,9 +496,14 @@
   BENCHMARK_GEMM(f32_gemm_4x8__neonfma_ld64)
   BENCHMARK_GEMM(f32_gemm_5x8__neon_ld64)
   BENCHMARK_GEMM(f32_gemm_5x8__neonfma_ld64)
-  BENCHMARK_GEMM(f32_gemm_6x8__neon_ld64)
-  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_ld64)
-
+  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
+  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
+  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
+  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
+  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
+  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
+  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)
+  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)
   BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
   BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64