AVX and FMA3 microkernels for GEMM/GEMMINC/IGEMM
PiperOrigin-RevId: 281807374
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index de9ac11..2bfe51c 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -341,12 +341,67 @@
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8s4__sse, 4, 8, 1, 4);
}
+ // Benchmark wrapper for the 1x8 AVX "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 1, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_1x8__avx_broadcast;
+   IGEMMBenchmark(state, ukernel, 1, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 4x8 AVX "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 4, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_4x8__avx_broadcast;
+   IGEMMBenchmark(state, ukernel, 4, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 5x8 AVX "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 5, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_5x8__avx_broadcast;
+   IGEMMBenchmark(state, ukernel, 5, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 6x8 AVX "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 6, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_6x8__avx_broadcast;
+   IGEMMBenchmark(state, ukernel, 6, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 7x8 AVX "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 7, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_7x8__avx_broadcast;
+   IGEMMBenchmark(state, ukernel, 7, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 1x8 FMA3 "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 1, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_1x8__fma3_broadcast;
+   IGEMMBenchmark(state, ukernel, 1, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 4x8 FMA3 "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 4, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_4x8__fma3_broadcast;
+   IGEMMBenchmark(state, ukernel, 4, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 5x8 FMA3 "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 5, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_5x8__fma3_broadcast;
+   IGEMMBenchmark(state, ukernel, 5, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 6x8 FMA3 "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 6, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_6x8__fma3_broadcast;
+   IGEMMBenchmark(state, ukernel, 6, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 7x8 FMA3 "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 7, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_7x8__fma3_broadcast;
+   IGEMMBenchmark(state, ukernel, 7, 8, 1, 1);
+ }
+
+ // Benchmark wrapper for the 8x8 FMA3 "broadcast" F32 IGEMM microkernel.
+ // Forwards `state` to IGEMMBenchmark together with the constants 8, 8, 1, 1
+ // (presumably the mr, nr, kr, sr tile parameters -- confirm against
+ // IGEMMBenchmark's declaration). `net` is not read in this body.
+ static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
+   const auto ukernel = xnn_f32_igemm_ukernel_8x8__fma3_broadcast;
+   IGEMMBenchmark(state, ukernel, 8, 8, 1, 1);
+ }
+
BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
BENCHMARK_CONV(f32_igemm_1x8s4__sse)
BENCHMARK_CONV(f32_igemm_4x8s4__sse)
+ BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)  // AVX broadcast wrappers: 1x8, 4x8, 5x8, 6x8, 7x8 (BENCHMARK_CONV is defined outside this view; presumably it registers the wrapper -- confirm)
+ BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
+ BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
+ BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
+ BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)
+ BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)  // FMA3 broadcast wrappers: same tile sizes as the AVX group, plus 8x8
+ BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
+ BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
+ BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
+ BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
+ BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)  // 8x8 appears only in the FMA3 group among the wrappers visible here
#endif /* XNN_ARCH_X86 || XNN_ARCH_X86_64 */
#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS