1x8 neonfma IGEMM microkernel and 1x8 benchmarks.

Add new 1x8 neonfma IGEMM intrinsics microkernel.
Add benchmarks for 1x8 neon and neonfma IGEMM microkernels.
BUG=142398150,140592595
PiperOrigin-RevId: 273849950
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index bbe9407..0d7e0da 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -151,6 +151,10 @@
 }
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  static void f32_igemm_1x8__neon_ld64(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x8 NEON ld64 f32 IGEMM microkernel; `net` is not referenced in this body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);  // NOTE(review): 1, 8, 1, 1 are presumably the mr, nr, kr, sr tile parameters -- confirm against IGEMMBenchmark.
+  }
+
   static void f32_igemm_4x2__neon_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_ld64, 4, 2, 1, 1);
   }
@@ -175,6 +179,10 @@
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
   }
 
+  static void f32_igemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x8 NEON FMA ld64 f32 IGEMM microkernel; `net` is not referenced in this body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);  // NOTE(review): 1, 8, 1, 1 are presumably the mr, nr, kr, sr tile parameters -- confirm against IGEMMBenchmark.
+  }
+
   static void f32_igemm_4x2__neonfma_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neonfma_ld64, 4, 2, 1, 1);
   }
@@ -199,6 +207,8 @@
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
   }
 
+  BENCHMARK_CONV(f32_igemm_1x8__neon_ld64)  // NOTE(review): presumably registers the wrapper with the benchmark harness; BENCHMARK_CONV is defined outside this hunk -- confirm.
+  BENCHMARK_CONV(f32_igemm_1x8__neonfma_ld64)  // NOTE(review): same macro applied to the NEON FMA variant of the 1x8 wrapper.
   BENCHMARK_CONV(f32_igemm_4x12__neon_ld64)
   BENCHMARK_CONV(f32_igemm_4x12__neonfma_ld64)
   BENCHMARK_CONV(f32_igemm_4x2__neon_ld64)