QS8 Neon GEMM microkernel with 8-bit multiply

PiperOrigin-RevId: 351893800
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index a231fce..63e09f3 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -296,6 +296,18 @@
   static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane, 4, 16, 1, 1, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_2x8__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x8__neon_mull_addw_dup, 2, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8__neon_mull_addw_dup, 4, 8, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup, 2, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16__neon_mull_addw_dup(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup, 4, 16, 1, 1, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot, 1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
   }
@@ -328,6 +340,10 @@
   BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_2x8__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16__neon_mull_addw_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16__neon_mull_addw_dup)
   BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)