QS8 C4 Neon GEMM and E2E benchmarks

PiperOrigin-RevId: 407932393
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 430832e..a125e50 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -427,6 +427,70 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, 4, 16, 2, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_1x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 1x8c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, 1, 8, 4, 1,  // 1, 8, 4 mirror the "1x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_2x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 2x8c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, 2, 8, 4, 1,  // 2, 8, 4 mirror the "2x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_3x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 3x8c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, 3, 8, 4, 1,  // 3, 8, 4 mirror the "3x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_4x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 4x8c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, 4, 8, 4, 1,  // 4, 8, 4 mirror the "4x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_1x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 1x16c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, 1, 16, 4, 1,  // 1, 16, 4 mirror the "1x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_2x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 2x16c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, 2, 16, 4, 1,  // 2, 16, 4 mirror the "2x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_3x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 3x16c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, 3, 16, 4, 1,  // 3, 16, 4 mirror the "3x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_4x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 4x16c4 neon_mull_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, 4, 16, 4, 1,  // 4, 16, 4 mirror the "4x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_1x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 1x8c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, 1, 8, 4, 1,  // 1, 8, 4 mirror the "1x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_2x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 2x8c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, 2, 8, 4, 1,  // 2, 8, 4 mirror the "2x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_3x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 3x8c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, 3, 8, 4, 1,  // 3, 8, 4 mirror the "3x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_4x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 4x8c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, 4, 8, 4, 1,  // 4, 8, 4 mirror the "4x8c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_1x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 1x16c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, 1, 16, 4, 1,  // 1, 16, 4 mirror the "1x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_2x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 2x16c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, 2, 16, 4, 1,  // 2, 16, 4 mirror the "2x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_3x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 3x16c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, 3, 16, 4, 1,  // 3, 16, 4 mirror the "3x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
+  static void qs8_gemm_4x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // Benchmark entry for the rndnu 4x16c4 neon_mlal_padal_dup kernel; `net` is unused in the body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, 4, 16, 4, 1,  // 4, 16, 4 mirror the "4x16c4" tile in the kernel name.
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // NOTE(review): CheckNEON presumably gates the run on NEON support; confirm.
+  }
   static void qs8_gemm_1x8c8_gemmlowp__neon_mull_padal(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, 1, 8, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_neon_params, benchmark::utils::CheckNEON);
@@ -555,21 +619,26 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
   }
-
-  BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mlal_padal)
   BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_padal_dup)  // c4 neon_mull_padal_dup registrations: 1x8..4x8, then 1x16..4x16.
+  BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_padal_dup)  // c4 neon_mlal_padal_dup registrations, same tile order as the mull group.
+  BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_padal_dup)
@@ -586,6 +655,16 @@
   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)  // gemmlowp neon_mlal_lane registrations: 1x8, 2x8, 3x8, 4x8, 6x8.
+  BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)  // Same kernel family with 16-wide tiles: 1x16, 2x16, 3x16, 4x16, 6x16.
+  BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_1x8c8_gemmlowp__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_2x8c8_gemmlowp__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_3x8c8_gemmlowp__neon_mull_padal)