6x8 ld128 GEMM microkernels

12 new kernels, covering every combination of:
- dup and lane
- neon and neonfma
- GEMM, GEMMINC and IGEMM

PiperOrigin-RevId: 282661111
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 1124d83..932d47e 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -414,17 +414,8 @@
   static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
   }
-  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1);
-  }
-  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1);
-  }
-  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1);
-  }
-  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1);
+  static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1);
   }
   BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
   BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
@@ -444,82 +435,89 @@
   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
   BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
-  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
   BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
   BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
   BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
-  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
-  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
-  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
-  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
-
+  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
 #endif  // XNN_ARCH_ARM64
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEON);
   }
-
+  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1, benchmark::utils::CheckNEONFMA);
+  }
+  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
+  }
+  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1, benchmark::utils::CheckNEONFMA);
+  }
+  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
+  }
+  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1, benchmark::utils::CheckNEONFMA);
+  }
   static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neon, 1, 8, 1, 4, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8s4__neonfma, 1, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
-
   static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neon, 4, 8, 1, 4, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8s4__neonfma, 4, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
-
   static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neon, 6, 8, 1, 4, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8s4__neonfma, 6, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
-
   static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neon, 8, 8, 1, 4, benchmark::utils::CheckNEON);
   }
-
   static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4, benchmark::utils::CheckNEONFMA);
   }
-
   static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
     PPMM1PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
   }
-
   static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
     PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8, benchmark::utils::CheckNEONFMA);
   }
 
   BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
-  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
   BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
   BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)
+  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)
   BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
   BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
   BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
@@ -731,7 +729,6 @@
 BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
 BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)
 
-
 #ifdef BENCHMARK_RUY
 BENCHMARK_GEMM(ruy_st)
 #endif  // BENCHMARK_RUY