GEMM/IGEMM implementations in WAsm SIMD intrinsics

PiperOrigin-RevId: 316620752
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 45ee132..9ea20e3 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -506,6 +506,129 @@
   BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast)
 #endif  /* XNN_ARCH_X86 || XNN_ARCH_X86_64 */
 
+#if XNN_ARCH_WASMSIMD
+  static void f32_igemm_3x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_loadsplat_arm, 3, 8, 1, 1);  // 3, 8 match the "3x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_loadsplat_arm, 4, 8, 1, 1);  // 4, 8 match the "4x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_loadsplat_arm, 5, 8, 1, 1);  // 5, 8 match the "5x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8__wasmsimd_loadsplat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_loadsplat_arm, 6, 8, 1, 1);  // 6, 8 match the "6x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_3x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_loadsplat_x86, 3, 8, 1, 1);  // 3, 8 match the "3x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_loadsplat_x86, 4, 8, 1, 1);  // 4, 8 match the "4x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_loadsplat_x86, 5, 8, 1, 1);  // 5, 8 match the "5x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8__wasmsimd_loadsplat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_loadsplat_x86, 6, 8, 1, 1);  // 6, 8 match the "6x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_3x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_splat_arm, 3, 8, 1, 1);  // 3, 8 match the "3x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_splat_arm, 4, 8, 1, 1);  // 4, 8 match the "4x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_splat_arm, 5, 8, 1, 1);  // 5, 8 match the "5x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8__wasmsimd_splat_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_splat_arm, 6, 8, 1, 1);  // 6, 8 match the "6x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_3x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_splat_x86, 3, 8, 1, 1);  // 3, 8 match the "3x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_splat_x86, 4, 8, 1, 1);  // 4, 8 match the "4x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_splat_x86, 5, 8, 1, 1);  // 5, 8 match the "5x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8__wasmsimd_splat_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_splat_x86, 6, 8, 1, 1);  // 6, 8 match the "6x8" in the kernel name; the trailing 1, 1 are presumably kr, sr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4);  // 3, 8 and the final 4 match "3x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4);  // 4, 8 and the final 4 match "4x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4);  // 5, 8 and the final 4 match "5x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4);  // 6, 8 and the final 4 match "6x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4);  // 3, 8 and the final 4 match "3x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4);  // 4, 8 and the final 4 match "4x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4);  // 5, 8 and the final 4 match "5x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  static void f32_igemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {  // Forwards to IGEMMBenchmark; `net` is not read here.
+    IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4);  // 6, 8 and the final 4 match "6x8s4" in the kernel name; the 1 is presumably kr -- confirm against IGEMMBenchmark.
+  }
+
+  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_loadsplat_arm)  // Each wrapper defined above is passed to BENCHMARK_CONV (macro defined outside this hunk), in definition order.
+  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_loadsplat_arm)
+  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_loadsplat_arm)
+  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_loadsplat_arm)
+  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_loadsplat_x86)
+  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_loadsplat_x86)
+  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_loadsplat_x86)
+  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_loadsplat_x86)
+  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_splat_arm)  // splat variants, arm then x86
+  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_splat_arm)
+  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_splat_arm)
+  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_splat_arm)
+  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_splat_x86)
+  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_splat_x86)
+  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_splat_x86)
+  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_splat_x86)
+  BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_arm)  // s4 variants, arm then x86
+  BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_arm)
+  BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_arm)
+  BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_arm)
+  BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_x86)
+  BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_x86)
+  BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_x86)
+  BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_x86)
+#endif  // XNN_ARCH_WASMSIMD
+
 #if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
   static void f32_igemm_1x8__psimd_loadsplat(benchmark::State& state, const char* net) {
     IGEMMBenchmark(state, xnn_f32_igemm_minmax_ukernel_1x8__psimd_loadsplat, 1, 8, 1, 1);