Add a 4x16 QS8 GEMM AArch64 assembly microkernel that uses the dot product instructions.

PiperOrigin-RevId: 335584437
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index c6b34c1..25cdc12 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -221,6 +221,15 @@
   BENCHMARK_GEMM(qs8_gemm_8x16c4__neondot)
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
+#if XNN_ARCH_ARM64
+  static void qs8_gemm_4x16c4__aarch64_neondot(benchmark::State& state, const char* net) {  // Forwards to GEMMBenchmark; `net` is not referenced in this body.
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot, 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);  // NOTE(review): 4, 16, 4, 1 presumably are mr, nr, kr, sr (cf. "4x16c4" in the kernel name) -- confirm against GEMMBenchmark's signature.
+  }
+
+  BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot)
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1);