FP16 4x8, 6x8 and 1x8 GEMM ld64 microkernels

PiperOrigin-RevId: 306697529
diff --git a/bench/f16-gemm.cc b/bench/f16-gemm.cc
index 9585f22..74a8564 100644
--- a/bench/f16-gemm.cc
+++ b/bench/f16-gemm.cc
@@ -136,11 +136,26 @@
     GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, 6, 16, 1);
   }
 
+  static void f16_gemm_1x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, 1, 8, 1);
+  }
+
+  static void f16_gemm_4x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, 4, 8, 1);
+  }
+
+  static void f16_gemm_6x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, 6, 8, 1);
+  }
+
   BENCHMARK_GEMM(f16_gemm_1x16__aarch64_neonfp16arith_ld32)
   BENCHMARK_GEMM(f16_gemm_4x16__aarch64_neonfp16arith_ld32)
   BENCHMARK_GEMM(f16_gemm_6x16__aarch64_neonfp16arith_ld32)
-
+  BENCHMARK_GEMM(f16_gemm_1x8__aarch64_neonfp16arith_ld64)
+  BENCHMARK_GEMM(f16_gemm_4x8__aarch64_neonfp16arith_ld64)
+  BENCHMARK_GEMM(f16_gemm_6x8__aarch64_neonfp16arith_ld64)
 #endif  // XNN_ARCH_ARM64
+
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
 BENCHMARK_MAIN();
 #endif