AVX and FMA3 microkernels for GEMM/GEMMINC/IGEMM

PiperOrigin-RevId: 281807374
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index de9ac11..2bfe51c 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -341,12 +341,67 @@
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
   }
 
+  static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x8 AVX broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__avx_broadcast, 1, 8, 1, 1);  // Literals 1, 8 match the 1x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 4x8 AVX broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__avx_broadcast, 4, 8, 1, 1);  // Literals 4, 8 match the 4x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 5x8 AVX broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__avx_broadcast, 5, 8, 1, 1);  // Literals 5, 8 match the 5x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 6x8 AVX broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__avx_broadcast, 6, 8, 1, 1);  // Literals 6, 8 match the 6x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 7x8 AVX broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__avx_broadcast, 7, 8, 1, 1);  // Literals 7, 8 match the 7x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 1x8 FMA3 broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1);  // Literals 1, 8 match the 1x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 4x8 FMA3 broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1);  // Literals 4, 8 match the 4x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 5x8 FMA3 broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1);  // Literals 5, 8 match the 5x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 6x8 FMA3 broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1);  // Literals 6, 8 match the 6x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 7x8 FMA3 broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1);  // Literals 7, 8 match the 7x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {  // Benchmark entry for the 8x8 FMA3 broadcast f32 IGEMM microkernel; `net` is not referenced in the body.
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1);  // Literals 8, 8 match the 8x8 tile in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
   BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
   BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
   BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
   BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
   BENCHMARK_CONV(f32_igemm_1x8s4__sse)
   BENCHMARK_CONV(f32_igemm_4x8s4__sse)
+  BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)  // AVX broadcast wrappers; BENCHMARK_CONV is defined outside this hunk and presumably registers each wrapper with the benchmark harness -- confirm.
+  BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
+  BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
+  BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
+  BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)
+  BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)  // FMA3 broadcast wrappers; this group additionally includes an 8x8 tile that the AVX group does not have.
+  BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
+  BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
+  BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
+  BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
+  BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)
 #endif  /* XNN_ARCH_X86 || XNN_ARCH_X86_64 */
 
 #if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS