Add fp32 IGEMM 4x8 and 6x8 ld64 microkernels (AArch64 NEON FMA)

PiperOrigin-RevId: 378945611
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 6b2abc1..a416ab8 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -354,6 +354,10 @@
     IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
       xnn_init_f32_minmax_scalar_params);
   }
+  static void f32_igemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
+      xnn_init_f32_minmax_scalar_params);
+  }
   static void f32_igemm_5x8__aarch64_neonfma_cortex_a57(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57, 5, 8, 1, 1,
       xnn_init_f32_minmax_scalar_params);
@@ -386,6 +390,10 @@
     IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
       xnn_init_f32_minmax_scalar_params);
   }
+  static void f32_igemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
+      xnn_init_f32_minmax_scalar_params);
+  }
   static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
       xnn_init_f32_minmax_scalar_params);
@@ -424,6 +432,7 @@
   BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a55)
   BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld64)
   BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
@@ -431,6 +440,7 @@
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld64)
   BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
   BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
   BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)