Add 1x16 QS8 GEMM AArch64 assembly microkernel using dot product.

PiperOrigin-RevId: 335801231
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 25cdc12..f4fab42 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -188,6 +188,9 @@
   static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot, 1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
+  }
   static void qs8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot, 4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
   }
@@ -200,6 +203,9 @@
   static void qs8_gemm_12x8c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot, 12, 8, 4, 1, benchmark::utils::CheckNEONDOT);
   }
+  static void qs8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot, 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
+  }
   static void qs8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
   }
@@ -212,20 +218,26 @@
 
   BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_12x8c4__neondot)
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_6x16c4__neondot)
   BENCHMARK_GEMM(qs8_gemm_8x16c4__neondot)
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_ARM64
+  static void qs8_gemm_1x16c4__aarch64_neondot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot, 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
+  }
   static void qs8_gemm_4x16c4__aarch64_neondot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
   }
 
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot)
 #endif  // XNN_ARCH_ARM64