WAsm SIMD versions of [I]GEMM microkernels with NR=2
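
Adds 4x2c4 GEMM and IGEMM micro-kernels for WAsm SIMD, generated from the new
MRx2c4-wasmsimd.c.in templates via xngen in LINEAR, RELU, and MINMAX variants
(with ARM- and x86-tuned min/max lowering), and wires them into BUILD.bazel and
the f32-gemm/f32-igemm generator scripts.

Each kernel keeps one 4-wide accumulator per (row, column) pair, multiplying 4
consecutive K elements of A against 4 packed weights per output column, and
reduces the 4 partial sums with shuffles and adds in the epilogue. A scalar
sketch of the assumed data layout is below; it is illustrative only (the
function name is hypothetical, kc is taken in elements rather than bytes, and
the packed weights are assumed to hold 2 biases followed by K zero-padded to a
multiple of 4, interleaved in groups of 4 per output column):

  #include <stddef.h>
  #include <stdint.h>

  // Scalar reference for the 4x2c4 layout: C[m][n] = bias[n] + sum_k A[m][k] * B[k][n].
  static void f32_gemm_4x2c4_reference(
      size_t kc,            // K elements per row of A
      const float* a,       // 4 x kc input rows
      size_t a_stride,      // row stride of A, in bytes
      const float* w,       // packed weights: w[0..1] = biases, then groups of 4 per column
      float* c,             // 4 x 2 outputs
      size_t cm_stride)     // row stride of C, in bytes
  {
    for (size_t m = 0; m < 4; m++) {
      const float* am = (const float*) ((uintptr_t) a + m * a_stride);
      float* cm = (float*) ((uintptr_t) c + m * cm_stride);
      for (size_t n = 0; n < 2; n++) {
        float acc = w[n];  // bias
        for (size_t k = 0; k < kc; k++) {
          // Group g = k / 4 stores 4 weights for column 0, then 4 for column 1.
          acc += am[k] * w[2 + (k / 4) * 8 + n * 4 + (k % 4)];
        }
        cm[n] = acc;
      }
    }
  }

The remainder path in the SIMD kernels masks A lanes where the zero-padded
weights are zero before the multiply, so out-of-range A lanes (which may hold
NaN or Inf) cannot contaminate the accumulators.
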
PiperOrigin-RevId: 320352097
diff --git a/BUILD.bazel b/BUILD.bazel
index 9e22c0a..cdcecd5 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -542,10 +542,13 @@
"src/f32-gemm/gen/1x8-wasmsimd-splat.c",
"src/f32-gemm/gen/4x8-wasmsimd-splat.c",
"src/f32-gemm/gen/5x8-wasmsimd-splat.c",
+ "src/f32-gemm/gen/4x2c4-wasmsimd.c",
"src/f32-gemm/gen/1x8-relu-wasmsimd-splat-arm.c",
"src/f32-gemm/gen/5x8-relu-wasmsimd-splat-arm.c",
"src/f32-gemm/gen/1x8-relu-wasmsimd-splat-x86.c",
"src/f32-gemm/gen/4x8-relu-wasmsimd-splat-x86.c",
+ "src/f32-gemm/gen/4x2c4-relu-wasmsimd-arm.c",
+ "src/f32-gemm/gen/4x2c4-relu-wasmsimd-x86.c",
"src/f32-gemm/gen/1x8-minmax-wasmsimd-loadsplat-arm.c",
"src/f32-gemm/gen/1x8-minmax-wasmsimd-splat-arm.c",
"src/f32-gemm/gen/1x8s4-minmax-wasmsimd-arm.c",
@@ -576,6 +579,8 @@
"src/f32-gemm/gen/6x8-minmax-wasmsimd-loadsplat-x86.c",
"src/f32-gemm/gen/6x8-minmax-wasmsimd-splat-x86.c",
"src/f32-gemm/gen/6x8s4-minmax-wasmsimd-x86.c",
+ "src/f32-gemm/gen/4x2c4-minmax-wasmsimd-arm.c",
+ "src/f32-gemm/gen/4x2c4-minmax-wasmsimd-x86.c",
"src/f32-gemm/gen-inc/1x8inc-minmax-wasmsimd-loadsplat-arm.c",
"src/f32-gemm/gen-inc/1x8inc-minmax-wasmsimd-splat-arm.c",
"src/f32-gemm/gen-inc/1x8s4inc-minmax-wasmsimd-arm.c",
@@ -609,10 +614,13 @@
"src/f32-igemm/gen/1x8-wasmsimd-splat.c",
"src/f32-igemm/gen/4x8-wasmsimd-splat.c",
"src/f32-igemm/gen/5x8-wasmsimd-splat.c",
+ "src/f32-igemm/gen/4x2c4-wasmsimd.c",
"src/f32-igemm/gen/1x8-relu-wasmsimd-splat-arm.c",
"src/f32-igemm/gen/5x8-relu-wasmsimd-splat-arm.c",
"src/f32-igemm/gen/1x8-relu-wasmsimd-splat-x86.c",
"src/f32-igemm/gen/4x8-relu-wasmsimd-splat-x86.c",
+ "src/f32-igemm/gen/4x2c4-relu-wasmsimd-arm.c",
+ "src/f32-igemm/gen/4x2c4-relu-wasmsimd-x86.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-loadsplat-arm.c",
"src/f32-igemm/gen/1x8-minmax-wasmsimd-splat-arm.c",
"src/f32-igemm/gen/1x8s4-minmax-wasmsimd-arm.c",
@@ -643,6 +651,8 @@
"src/f32-igemm/gen/6x8-minmax-wasmsimd-loadsplat-x86.c",
"src/f32-igemm/gen/6x8-minmax-wasmsimd-splat-x86.c",
"src/f32-igemm/gen/6x8s4-minmax-wasmsimd-x86.c",
+ "src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c",
+ "src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c",
"src/f32-hswish/gen/wasmsimd-arm-x4.c",
"src/f32-hswish/gen/wasmsimd-arm-x8.c",
"src/f32-hswish/gen/wasmsimd-arm-x16.c",
diff --git a/scripts/generate-f32-gemm.sh b/scripts/generate-f32-gemm.sh
index d3f8165..a244156 100755
--- a/scripts/generate-f32-gemm.sh
+++ b/scripts/generate-f32-gemm.sh
@@ -323,6 +323,13 @@
tools/xngen src/f32-gemm/wasmsimd-s4.c.in -D MR=6 -D NR=8 -D X86=1 -D INC=0 -D ACTIVATION=MINMAX -o src/f32-gemm/gen/6x8s4-minmax-wasmsimd-x86.c
tools/xngen src/f32-gemm/wasmsimd-s4.c.in -D MR=6 -D NR=8 -D X86=1 -D INC=1 -D ACTIVATION=MINMAX -o src/f32-gemm/gen-inc/6x8s4inc-minmax-wasmsimd-x86.c
+### MRx2 micro-kernels
+tools/xngen src/f32-gemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=0 -D ACTIVATION=MINMAX -o src/f32-gemm/gen/4x2c4-minmax-wasmsimd-arm.c
+tools/xngen src/f32-gemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=1 -D ACTIVATION=MINMAX -o src/f32-gemm/gen/4x2c4-minmax-wasmsimd-x86.c
+tools/xngen src/f32-gemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=0 -D ACTIVATION=RELU -o src/f32-gemm/gen/4x2c4-relu-wasmsimd-arm.c
+tools/xngen src/f32-gemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=1 -D ACTIVATION=RELU -o src/f32-gemm/gen/4x2c4-relu-wasmsimd-x86.c
+tools/xngen src/f32-gemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=0 -D ACTIVATION=LINEAR -o src/f32-gemm/gen/4x2c4-wasmsimd.c
+
################################### x86 SSE ###################################
### LOAD1+BROADCAST micro-kernels
tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8-minmax-sse-load1.c
diff --git a/scripts/generate-f32-igemm.sh b/scripts/generate-f32-igemm.sh
index 58f9b9d..defa685 100755
--- a/scripts/generate-f32-igemm.sh
+++ b/scripts/generate-f32-igemm.sh
@@ -87,6 +87,13 @@
tools/xngen src/f32-igemm/wasmsimd-s4.c.in -D MR=5 -D NR=8 -D X86=1 -D ACTIVATION=MINMAX -o src/f32-igemm/gen/5x8s4-minmax-wasmsimd-x86.c
tools/xngen src/f32-igemm/wasmsimd-s4.c.in -D MR=6 -D NR=8 -D X86=1 -D ACTIVATION=MINMAX -o src/f32-igemm/gen/6x8s4-minmax-wasmsimd-x86.c
+### MRx2 micro-kernels
+tools/xngen src/f32-igemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=0 -D ACTIVATION=MINMAX -o src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c
+tools/xngen src/f32-igemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=1 -D ACTIVATION=MINMAX -o src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c
+tools/xngen src/f32-igemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=0 -D ACTIVATION=RELU -o src/f32-igemm/gen/4x2c4-relu-wasmsimd-arm.c
+tools/xngen src/f32-igemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=1 -D ACTIVATION=RELU -o src/f32-igemm/gen/4x2c4-relu-wasmsimd-x86.c
+tools/xngen src/f32-igemm/MRx2c4-wasmsimd.c.in -D MR=4 -D NR=2 -D X86=0 -D ACTIVATION=LINEAR -o src/f32-igemm/gen/4x2c4-wasmsimd.c
+
############################### AArch64 assembly ##############################
# Cortex A75 / A57 micro-kernels
tools/xngen src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in -D INC=0 -D PREFETCH=0 -o src/f32-igemm/gen/1x8-minmax-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemm/MRx2c4-wasmsimd.c.in b/src/f32-gemm/MRx2c4-wasmsimd.c.in
new file mode 100644
index 0000000..ae48366
--- /dev/null
+++ b/src/f32-gemm/MRx2c4-wasmsimd.c.in
@@ -0,0 +1,161 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR == 2
+$assert MR % 2 == 0
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+
+
+$assert ACTIVATION in ["LINEAR", "RELU", "MINMAX"]
+$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower())
+$ARCH_SUFFIX = "" if ACTIVATION == "LINEAR" else "_x86" if X86 else "_arm"
+$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
+void xnn_f32_gemm${ACTIVATION_SUFFIX}_ukernel_${MR}x${NR}c4__wasmsimd${ARCH_SUFFIX}(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* restrict a,
+ size_t a_stride,
+ const float* restrict w,
+ float* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= ${MR});
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ $for M in range(1, MR):
+ const float* a${M} = (const float*) ((uintptr_t) a${M-1} + a_stride);
+ float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+ $if M % 2 == 0:
+ if XNN_UNPREDICTABLE(mr <= ${M}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+ $elif M + 1 == MR:
+ if XNN_UNPREDICTABLE(mr != ${M+1}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+ $else:
+ if XNN_UNPREDICTABLE(mr < ${M+1}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+
+ $if ACTIVATION == "MINMAX" and not X86:
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ $for N in range(1, NR):
+ v128_t vacc0x${N}c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[${N}]);
+ $for M in range(1, MR):
+ $for N in range(NR):
+ v128_t vacc${M}x${N}c4 = vacc0x${N}c4;
+ w += ${NR};
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ $for M in range(MR):
+ const v128_t va${M} = wasm_v128_load(a${M});
+ a${M} += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ $for N in range(1, NR):
+ const v128_t vb${N} = wasm_v128_load(w + ${N * 4});
+ w += ${NR * 4};
+
+ $for M in range(MR):
+ $for N in range(NR):
+ vacc${M}x${N}c4 = wasm_f32x4_add(vacc${M}x${N}c4, wasm_f32x4_mul(va${M}, vb${N}));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ $for M in range(MR):
+ const v128_t va${M} = wasm_v128_load(a${M});
+ a${M} = (const float*) ((uintptr_t) a${M} + k);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ $for N in range(1, NR):
+ const v128_t vb${N} = wasm_v128_load(w + ${N * 4});
+ w += ${NR * 4};
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ $for N in range(NR):
+ const v128_t vmask${N} = wasm_f32x4_eq(vb${N}, vzero);
+
+ $for M in range(MR):
+ $for N in range(NR):
+ vacc${M}x${N}c4 = wasm_f32x4_add(vacc${M}x${N}c4, wasm_f32x4_mul(wasm_v128_andnot(va${M}, vmask${N}), vb${N}));
+ }
+
+ $for M in range(MR):
+ const v128_t vacc${M}x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc${M}x0c4, vacc${M}x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc${M}x0c4, vacc${M}x1c4, 2, 6, 3, 7));
+
+ $for M in range(0, MR, 2):
+ v128_t vacc${M}${M+1}x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc${M}x01c2, vacc${M+1}x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc${M}x01c2, vacc${M+1}x01c2, 2, 3, 6, 7));
+
+ $if ACTIVATION == "MINMAX":
+ $if X86:
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_v128_bitselect(vmin, vacc${M}${M+1}x01, wasm_f32x4_lt(vacc${M}${M+1}x01, vmin));
+ $else:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_f32x4_max(vacc${M}${M+1}x01, vmin);
+
+ $if X86:
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_v128_bitselect(vacc${M}${M+1}x01, vmax, wasm_f32x4_le(vacc${M}${M+1}x01, vmax));
+ $else:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_f32x4_min(vacc${M}${M+1}x01, vmax);
+ $elif ACTIVATION == "RELU":
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ $if X86:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_v128_andnot(vacc${M}${M+1}x01, wasm_f32x4_le(vacc${M}${M+1}x01, vzero));
+ $else:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_f32x4_max(vacc${M}${M+1}x01, vzero);
+
+ if XNN_LIKELY(nc >= ${NR}) {
+ $for M in reversed(range(0, MR, 2)):
+ *((double*) c${M}) = wasm_f64x2_extract_lane(vacc${M}${M+1}x01, 0);
+ c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+ a${M} = (const float*) ((uintptr_t) a${M} - kc);
+ *((double*) c${M+1}) = wasm_f64x2_extract_lane(vacc${M}${M+1}x01, 1);
+ c${M+1} = (float*) ((uintptr_t) c${M+1} + cn_stride);
+ a${M+1} = (const float*) ((uintptr_t) a${M+1} - kc);
+
+ nc -= ${NR};
+ } else {
+ assert(nc == 1);
+ $for M in reversed(range(0, MR, 2)):
+ *c${M} = wasm_f32x4_extract_lane(vacc${M}${M+1}x01, 0);
+ *c${M+1} = wasm_f32x4_extract_lane(vacc${M}${M+1}x01, 2);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/4x2c4-minmax-wasmsimd-arm.c b/src/f32-gemm/gen/4x2c4-minmax-wasmsimd-arm.c
new file mode 100644
index 0000000..b755489
--- /dev/null
+++ b/src/f32-gemm/gen/4x2c4-minmax-wasmsimd-arm.c
@@ -0,0 +1,175 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* restrict a,
+ size_t a_stride,
+ const float* restrict w,
+ float* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 = (const float*) ((uintptr_t) a0 + k);
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 = (const float*) ((uintptr_t) a1 + k);
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 = (const float*) ((uintptr_t) a2 + k);
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 = (const float*) ((uintptr_t) a3 + k);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+ vacc01x01 = wasm_f32x4_max(vacc01x01, vmin);
+ vacc23x01 = wasm_f32x4_max(vacc23x01, vmin);
+
+ vacc01x01 = wasm_f32x4_min(vacc01x01, vmax);
+ vacc23x01 = wasm_f32x4_min(vacc23x01, vmax);
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/4x2c4-minmax-wasmsimd-x86.c b/src/f32-gemm/gen/4x2c4-minmax-wasmsimd-x86.c
new file mode 100644
index 0000000..261de9b
--- /dev/null
+++ b/src/f32-gemm/gen/4x2c4-minmax-wasmsimd-x86.c
@@ -0,0 +1,175 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* restrict a,
+ size_t a_stride,
+ const float* restrict w,
+ float* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 = (const float*) ((uintptr_t) a0 + k);
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 = (const float*) ((uintptr_t) a1 + k);
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 = (const float*) ((uintptr_t) a2 + k);
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 = (const float*) ((uintptr_t) a3 + k);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ vacc01x01 = wasm_v128_bitselect(vmin, vacc01x01, wasm_f32x4_lt(vacc01x01, vmin));
+ vacc23x01 = wasm_v128_bitselect(vmin, vacc23x01, wasm_f32x4_lt(vacc23x01, vmin));
+
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ vacc01x01 = wasm_v128_bitselect(vacc01x01, vmax, wasm_f32x4_le(vacc01x01, vmax));
+ vacc23x01 = wasm_v128_bitselect(vacc23x01, vmax, wasm_f32x4_le(vacc23x01, vmax));
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/4x2c4-relu-wasmsimd-arm.c b/src/f32-gemm/gen/4x2c4-relu-wasmsimd-arm.c
new file mode 100644
index 0000000..e733513
--- /dev/null
+++ b/src/f32-gemm/gen/4x2c4-relu-wasmsimd-arm.c
@@ -0,0 +1,171 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* restrict a,
+ size_t a_stride,
+ const float* restrict w,
+ float* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 = (const float*) ((uintptr_t) a0 + k);
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 = (const float*) ((uintptr_t) a1 + k);
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 = (const float*) ((uintptr_t) a2 + k);
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 = (const float*) ((uintptr_t) a3 + k);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ vacc01x01 = wasm_f32x4_max(vacc01x01, vzero);
+ vacc23x01 = wasm_f32x4_max(vacc23x01, vzero);
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/4x2c4-relu-wasmsimd-x86.c b/src/f32-gemm/gen/4x2c4-relu-wasmsimd-x86.c
new file mode 100644
index 0000000..e00d608
--- /dev/null
+++ b/src/f32-gemm/gen/4x2c4-relu-wasmsimd-x86.c
@@ -0,0 +1,171 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* restrict a,
+ size_t a_stride,
+ const float* restrict w,
+ float* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 = (const float*) ((uintptr_t) a0 + k);
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 = (const float*) ((uintptr_t) a1 + k);
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 = (const float*) ((uintptr_t) a2 + k);
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 = (const float*) ((uintptr_t) a3 + k);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ vacc01x01 = wasm_v128_andnot(vacc01x01, wasm_f32x4_le(vacc01x01, vzero));
+ vacc23x01 = wasm_v128_andnot(vacc23x01, wasm_f32x4_le(vacc23x01, vzero));
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-gemm/gen/4x2c4-wasmsimd.c b/src/f32-gemm/gen/4x2c4-wasmsimd.c
new file mode 100644
index 0000000..0d3d560
--- /dev/null
+++ b/src/f32-gemm/gen/4x2c4-wasmsimd.c
@@ -0,0 +1,168 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_f32_gemm_ukernel_4x2c4__wasmsimd(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const float* restrict a,
+ size_t a_stride,
+ const float* restrict w,
+ float* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const float* a0 = a;
+ float* c0 = c;
+ const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 = (const float*) ((uintptr_t) a0 + k);
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 = (const float*) ((uintptr_t) a1 + k);
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 = (const float*) ((uintptr_t) a2 + k);
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 = (const float*) ((uintptr_t) a3 + k);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ a2 = (const float*) ((uintptr_t) a2 - kc);
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ a3 = (const float*) ((uintptr_t) a3 - kc);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+ a0 = (const float*) ((uintptr_t) a0 - kc);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ a1 = (const float*) ((uintptr_t) a1 - kc);
+
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/MRx2c4-wasmsimd.c.in b/src/f32-igemm/MRx2c4-wasmsimd.c.in
new file mode 100644
index 0000000..3ad7505
--- /dev/null
+++ b/src/f32-igemm/MRx2c4-wasmsimd.c.in
@@ -0,0 +1,171 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert NR == 2
+$assert MR % 2 == 0
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/igemm.h>
+
+
+$assert ACTIVATION in ["LINEAR", "RELU", "MINMAX"]
+$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower())
+$ARCH_SUFFIX = "" if ACTIVATION == "LINEAR" else "_x86" if X86 else "_arm"
+$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
+void xnn_f32_igemm${ACTIVATION_SUFFIX}_ukernel_${MR}x${NR}c4__wasmsimd${ARCH_SUFFIX}(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= ${MR});
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (${MR} * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ $for M in range(1, MR):
+ float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
+ $if M % 2 == 0:
+ if XNN_UNPREDICTABLE(mr <= ${M}) {
+ c${M} = c${M-1};
+ }
+ $elif M + 1 == MR:
+ if XNN_UNPREDICTABLE(mr != ${M+1}) {
+ c${M} = c${M-1};
+ }
+ $else:
+ if XNN_UNPREDICTABLE(mr < ${M+1}) {
+ c${M} = c${M-1};
+ }
+
+ $if ACTIVATION == "MINMAX" and not X86:
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ $for N in range(1, NR):
+ v128_t vacc0x${N}c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[${N}]);
+ $for M in range(1, MR):
+ $for N in range(NR):
+ v128_t vacc${M}x${N}c4 = vacc0x${N}c4;
+ w += ${NR};
+
+ size_t p = ks;
+ do {
+ $for M in range(MR):
+ const float* restrict a${M} = a[${M}];
+ assert(a${M} != NULL);
+ if XNN_UNPREDICTABLE(a${M} != zero) {
+ a${M} = (const float*) ((uintptr_t) a${M} + a_offset);
+ }
+ a += ${MR};
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ $for M in range(MR):
+ const v128_t va${M} = wasm_v128_load(a${M});
+ a${M} += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ $for N in range(1, NR):
+ const v128_t vb${N} = wasm_v128_load(w + ${N * 4});
+ w += ${NR * 4};
+
+ $for M in range(MR):
+ $for N in range(NR):
+ vacc${M}x${N}c4 = wasm_f32x4_add(vacc${M}x${N}c4, wasm_f32x4_mul(va${M}, vb${N}));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ $for M in range(MR):
+ const v128_t va${M} = wasm_v128_load(a${M});
+
+ const v128_t vb0 = wasm_v128_load(w);
+ $for N in range(1, NR):
+ const v128_t vb${N} = wasm_v128_load(w + ${N * 4});
+ w += ${NR * 4};
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ $for N in range(NR):
+ const v128_t vmask${N} = wasm_f32x4_eq(vb${N}, vzero);
+
+ $for M in range(MR):
+ $for N in range(NR):
+ vacc${M}x${N}c4 = wasm_f32x4_add(vacc${M}x${N}c4, wasm_f32x4_mul(wasm_v128_andnot(va${M}, vmask${N}), vb${N}));
+ }
+ p -= ${MR} * sizeof(void*);
+ } while (p != 0);
+
+ $for M in range(MR):
+ const v128_t vacc${M}x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc${M}x0c4, vacc${M}x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc${M}x0c4, vacc${M}x1c4, 2, 6, 3, 7));
+
+ $for M in range(0, MR, 2):
+ v128_t vacc${M}${M+1}x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc${M}x01c2, vacc${M+1}x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc${M}x01c2, vacc${M+1}x01c2, 2, 3, 6, 7));
+
+ $if ACTIVATION == "MINMAX":
+ $if X86:
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_v128_bitselect(vmin, vacc${M}${M+1}x01, wasm_f32x4_lt(vacc${M}${M+1}x01, vmin));
+ $else:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_f32x4_max(vacc${M}${M+1}x01, vmin);
+
+ $if X86:
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_v128_bitselect(vacc${M}${M+1}x01, vmax, wasm_f32x4_le(vacc${M}${M+1}x01, vmax));
+ $else:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_f32x4_min(vacc${M}${M+1}x01, vmax);
+ $elif ACTIVATION == "RELU":
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ $if X86:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_v128_andnot(vacc${M}${M+1}x01, wasm_f32x4_le(vacc${M}${M+1}x01, vzero));
+ $else:
+ $for M in range(0, MR, 2):
+ vacc${M}${M+1}x01 = wasm_f32x4_max(vacc${M}${M+1}x01, vzero);
+
+ if XNN_LIKELY(nc >= ${NR}) {
+ $for M in reversed(range(0, MR, 2)):
+ *((double*) c${M+1}) = wasm_f64x2_extract_lane(vacc${M}${M+1}x01, 1);
+ c${M+1} = (float*) ((uintptr_t) c${M+1} + cn_stride);
+ *((double*) c${M}) = wasm_f64x2_extract_lane(vacc${M}${M+1}x01, 0);
+ c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= ${NR};
+ } else {
+ assert(nc == 1);
+ $for M in reversed(range(0, MR, 2)):
+ *c${M+1} = wasm_f32x4_extract_lane(vacc${M}${M+1}x01, 2);
+ *c${M} = wasm_f32x4_extract_lane(vacc${M}${M+1}x01, 0);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c b/src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c
new file mode 100644
index 0000000..f494509
--- /dev/null
+++ b/src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c
@@ -0,0 +1,192 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ const v128_t va1 = wasm_v128_load(a1);
+ const v128_t va2 = wasm_v128_load(a2);
+ const v128_t va3 = wasm_v128_load(a3);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+ vacc01x01 = wasm_f32x4_max(vacc01x01, vmin);
+ vacc23x01 = wasm_f32x4_max(vacc23x01, vmin);
+
+ vacc01x01 = wasm_f32x4_min(vacc01x01, vmax);
+ vacc23x01 = wasm_f32x4_min(vacc23x01, vmax);
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c b/src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c
new file mode 100644
index 0000000..8bd83c4
--- /dev/null
+++ b/src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c
@@ -0,0 +1,192 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ const v128_t va1 = wasm_v128_load(a1);
+ const v128_t va2 = wasm_v128_load(a2);
+ const v128_t va3 = wasm_v128_load(a3);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
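+ // K remainder (k < 4): a full 4-float vector is still loaded, so input lanes whose
+ // packed weight is zero (the zero-padded tail of w) are masked out with andnot;
+ // this keeps garbage or NaN values in the over-read lanes out of the accumulators.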
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
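+ // Horizontal reduction: each vaccMxNc4 holds 4 partial sums for one (row, column)
+ // output. The first shuffle/add stage interleaves and pairwise-adds the column-0
+ // and column-1 accumulators of a row; the second stage finishes the reduction so
+ // that vacc01x01 holds {row0 col0, row0 col1, row1 col0, row1 col1} and vacc23x01
+ // the same for rows 2-3.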
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
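+ // x86 flavor: clamp with compare + bitselect rather than wasm f32x4_min/max,
+ // presumably because the NaN-propagating f32x4_min/max lower to longer SSE
+ // sequences, while compare + bitselect stays cheap (the _arm variant uses
+ // f32x4_min/max directly).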
+ const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+ vacc01x01 = wasm_v128_bitselect(vmin, vacc01x01, wasm_f32x4_lt(vacc01x01, vmin));
+ vacc23x01 = wasm_v128_bitselect(vmin, vacc23x01, wasm_f32x4_lt(vacc23x01, vmin));
+
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ vacc01x01 = wasm_v128_bitselect(vacc01x01, vmax, wasm_f32x4_le(vacc01x01, vmax));
+ vacc23x01 = wasm_v128_bitselect(vacc23x01, vmax, wasm_f32x4_le(vacc23x01, vmax));
+
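+ // Each row produces two floats, stored with a single 64-bit write by extracting
+ // one f64x2 lane; for the nc == 1 tail only the first float of each row is kept.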
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/4x2c4-relu-wasmsimd-arm.c b/src/f32-igemm/gen/4x2c4-relu-wasmsimd-arm.c
new file mode 100644
index 0000000..52a0950
--- /dev/null
+++ b/src/f32-igemm/gen/4x2c4-relu-wasmsimd-arm.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ const v128_t va1 = wasm_v128_load(a1);
+ const v128_t va2 = wasm_v128_load(a2);
+ const v128_t va3 = wasm_v128_load(a3);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
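+ // ReLU (arm flavor): clamp at zero with f32x4_max against a zero vector.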
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ vacc01x01 = wasm_f32x4_max(vacc01x01, vzero);
+ vacc23x01 = wasm_f32x4_max(vacc23x01, vzero);
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/4x2c4-relu-wasmsimd-x86.c b/src/f32-igemm/gen/4x2c4-relu-wasmsimd-x86.c
new file mode 100644
index 0000000..6f4c8cb
--- /dev/null
+++ b/src/f32-igemm/gen/4x2c4-relu-wasmsimd-x86.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ const v128_t va1 = wasm_v128_load(a1);
+ const v128_t va2 = wasm_v128_load(a2);
+ const v128_t va3 = wasm_v128_load(a3);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
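+ // ReLU (x86 flavor): zero out non-positive lanes with andnot(x, x <= 0) instead of
+ // f32x4_max, which appears to lower less efficiently on x86 engines.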
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ vacc01x01 = wasm_v128_andnot(vacc01x01, wasm_f32x4_le(vacc01x01, vzero));
+ vacc23x01 = wasm_v128_andnot(vacc23x01, wasm_f32x4_le(vacc23x01, vzero));
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/f32-igemm/gen/4x2c4-wasmsimd.c b/src/f32-igemm/gen/4x2c4-wasmsimd.c
new file mode 100644
index 0000000..831328b
--- /dev/null
+++ b/src/f32-igemm/gen/4x2c4-wasmsimd.c
@@ -0,0 +1,185 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/MRx2c4-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_f32_igemm_ukernel_4x2c4__wasmsimd(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const float**restrict a,
+ const float*restrict w,
+ float*restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const float* zero,
+ const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(float) == 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(float) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ float* c0 = c;
+ float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ v128_t vacc0x0c4 = wasm_f32x4_replace_lane(wasm_f32x4_splat(0.0f), 0, w[0]);
+ v128_t vacc0x1c4 = wasm_f32x4_replace_lane(vacc0x0c4, 0, w[1]);
+ v128_t vacc1x0c4 = vacc0x0c4;
+ v128_t vacc1x1c4 = vacc0x1c4;
+ v128_t vacc2x0c4 = vacc0x0c4;
+ v128_t vacc2x1c4 = vacc0x1c4;
+ v128_t vacc3x0c4 = vacc0x0c4;
+ v128_t vacc3x1c4 = vacc0x1c4;
+ w += 2;
+
+ size_t p = ks;
+ do {
+ const float* restrict a0 = a[0];
+ assert(a0 != NULL);
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const float*) ((uintptr_t) a0 + a_offset);
+ }
+ const float* restrict a1 = a[1];
+ assert(a1 != NULL);
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const float*) ((uintptr_t) a1 + a_offset);
+ }
+ const float* restrict a2 = a[2];
+ assert(a2 != NULL);
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const float*) ((uintptr_t) a2 + a_offset);
+ }
+ const float* restrict a3 = a[3];
+ assert(a3 != NULL);
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const float*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
+ const v128_t va0 = wasm_v128_load(a0);
+ a0 += 4;
+ const v128_t va1 = wasm_v128_load(a1);
+ a1 += 4;
+ const v128_t va2 = wasm_v128_load(a2);
+ a2 += 4;
+ const v128_t va3 = wasm_v128_load(a3);
+ a3 += 4;
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(va0, vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(va0, vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(va1, vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(va1, vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(va2, vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(va2, vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(va3, vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(va3, vb1));
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const v128_t va0 = wasm_v128_load(a0);
+ const v128_t va1 = wasm_v128_load(a1);
+ const v128_t va2 = wasm_v128_load(a2);
+ const v128_t va3 = wasm_v128_load(a3);
+
+ const v128_t vb0 = wasm_v128_load(w);
+ const v128_t vb1 = wasm_v128_load(w + 4);
+ w += 8;
+
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+ const v128_t vmask0 = wasm_f32x4_eq(vb0, vzero);
+ const v128_t vmask1 = wasm_f32x4_eq(vb1, vzero);
+
+ vacc0x0c4 = wasm_f32x4_add(vacc0x0c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask0), vb0));
+ vacc0x1c4 = wasm_f32x4_add(vacc0x1c4, wasm_f32x4_mul(wasm_v128_andnot(va0, vmask1), vb1));
+ vacc1x0c4 = wasm_f32x4_add(vacc1x0c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask0), vb0));
+ vacc1x1c4 = wasm_f32x4_add(vacc1x1c4, wasm_f32x4_mul(wasm_v128_andnot(va1, vmask1), vb1));
+ vacc2x0c4 = wasm_f32x4_add(vacc2x0c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask0), vb0));
+ vacc2x1c4 = wasm_f32x4_add(vacc2x1c4, wasm_f32x4_mul(wasm_v128_andnot(va2, vmask1), vb1));
+ vacc3x0c4 = wasm_f32x4_add(vacc3x0c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask0), vb0));
+ vacc3x1c4 = wasm_f32x4_add(vacc3x1c4, wasm_f32x4_mul(wasm_v128_andnot(va3, vmask1), vb1));
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const v128_t vacc0x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc0x0c4, vacc0x1c4, 2, 6, 3, 7));
+ const v128_t vacc1x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc1x0c4, vacc1x1c4, 2, 6, 3, 7));
+ const v128_t vacc2x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc2x0c4, vacc2x1c4, 2, 6, 3, 7));
+ const v128_t vacc3x01c2 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 0, 4, 1, 5),
+ wasm_v32x4_shuffle(vacc3x0c4, vacc3x1c4, 2, 6, 3, 7));
+
+ v128_t vacc01x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc0x01c2, vacc1x01c2, 2, 3, 6, 7));
+ v128_t vacc23x01 = wasm_f32x4_add(
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 0, 1, 4, 5),
+ wasm_v32x4_shuffle(vacc2x01c2, vacc3x01c2, 2, 3, 6, 7));
+
+
+ if XNN_LIKELY(nc >= 2) {
+ *((double*) c3) = wasm_f64x2_extract_lane(vacc23x01, 1);
+ c3 = (float*) ((uintptr_t) c3 + cn_stride);
+ *((double*) c2) = wasm_f64x2_extract_lane(vacc23x01, 0);
+ c2 = (float*) ((uintptr_t) c2 + cn_stride);
+ *((double*) c1) = wasm_f64x2_extract_lane(vacc01x01, 1);
+ c1 = (float*) ((uintptr_t) c1 + cn_stride);
+ *((double*) c0) = wasm_f64x2_extract_lane(vacc01x01, 0);
+ c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const float**restrict) ((uintptr_t) a - ks);
+ nc -= 2;
+ } else {
+ assert(nc == 1);
+ *c3 = wasm_f32x4_extract_lane(vacc23x01, 2);
+ *c2 = wasm_f32x4_extract_lane(vacc23x01, 0);
+ *c1 = wasm_f32x4_extract_lane(vacc01x01, 2);
+ *c0 = wasm_f32x4_extract_lane(vacc01x01, 0);
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/init.c b/src/init.c
index 561b336..981214a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1724,6 +1724,14 @@
xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
xnn_params.f32.gemm.mr = 4;
xnn_params.f32.gemm.nr = 8;
+
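+ // 4x2c4 geometry: MR=4 rows, NR=2 columns, KR=2^log2_kr=4 K elements per packed block.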
+ xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ xnn_params.f32.gemm2.mr = 4;
+ xnn_params.f32.gemm2.nr = 2;
+ xnn_params.f32.gemm2.log2_kr = 2;
} else {
xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_splat_arm);
xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_splat_arm);
@@ -1739,12 +1747,15 @@
xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
xnn_params.f32.gemm.mr = 5;
xnn_params.f32.gemm.nr = 8;
+
+ xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ xnn_params.f32.gemm2.mr = 4;
+ xnn_params.f32.gemm2.nr = 2;
+ xnn_params.f32.gemm2.log2_kr = 2;
}
- xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__psimd);
- xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__psimd);
- xnn_params.f32.gemm2.mr = 4;
- xnn_params.f32.gemm2.nr = 2;
- xnn_params.f32.gemm2.log2_kr = 2;
if (is_wasm_x86) {
xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 255231e..7494cfb 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -232,6 +232,14 @@
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2c4__wasmsimd)
+
+DECLARE_F32_GEMM_RELU_UKERNEL_FUNCTION(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm)
+DECLARE_F32_GEMM_RELU_UKERNEL_FUNCTION(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86)
+
+DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm)
+DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86)
+
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__wasm)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__wasm)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__wasm)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 7c5223a..f95f665 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -231,6 +231,14 @@
DECLARE_F32_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86)
DECLARE_F32_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__wasmsimd)
+
+DECLARE_F32_IGEMM_RELU_UKERNEL_FUNCTION(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm)
+DECLARE_F32_IGEMM_RELU_UKERNEL_FUNCTION(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86)
+
+DECLARE_F32_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm)
+DECLARE_F32_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86)
+
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__wasm)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__wasm)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__wasm)
diff --git a/test/f32-gemm-minmax.cc b/test/f32-gemm-minmax.cc
index 9d6595c..652c11a 100644
--- a/test/f32-gemm-minmax.cc
+++ b/test/f32-gemm-minmax.cc
@@ -57625,6 +57625,864 @@
#endif // XNN_ARCH_WASMSIMD
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4_strided_a) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4_strided_a) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4_strided_a) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_strided_a) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_strided_a) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, qmin) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, qmax) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4_strided_a) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4_strided_a) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4_strided_a) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_strided_a) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_strided_a) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, qmin) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, qmax) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_GEMM_MINMAX_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-gemm-minmax.yaml b/test/f32-gemm-minmax.yaml
index 3761df1..7bbfcb1 100644
--- a/test/f32-gemm-minmax.yaml
+++ b/test/f32-gemm-minmax.yaml
@@ -332,6 +332,10 @@
k-block: 4
- name: xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86
k-block: 4
+- name: xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm
+ k-block: 4
+- name: xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86
+ k-block: 4
- name: xnn_f32_gemm_minmax_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_gemm_minmax_ukernel_2x4__wasm
diff --git a/test/f32-gemm-relu.cc b/test/f32-gemm-relu.cc
index c37dcee..2bc5dd8 100644
--- a/test/f32-gemm-relu.cc
+++ b/test/f32-gemm-relu.cc
@@ -1634,6 +1634,812 @@
#endif // XNN_ARCH_WASMSIMD
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_lt_4_strided_a) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_gt_4_strided_a) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_div_4_strided_a) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2_strided_a) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2_strided_a) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_ARM, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_lt_4_strided_a) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_gt_4_strided_a) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_div_4_strided_a) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2_strided_a) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2_strided_a) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_RELU_4X2C4__WASMSIMD_X86, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
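All of the cases above drive the same 4x2c4 tile: mr(4)/nr(2) select a 4-row by 2-column output block, kr(4) matches the c4 suffix (four K elements consumed per step), and sr(1) leaves the packed weights unshuffled. The _arm and _x86 suffixes are two lowerings of the same microkernel; the split shows up only for the RELU and MINMAX activations (the linear kernels below carry no suffix), so the difference presumably comes down to how the activation is expressed in WAsm SIMD. A minimal, hypothetical sketch of the RELU step, not the actual kernel code:

#include <wasm_simd128.h>

/* Hypothetical illustration: clamp a 4-lane f32 accumulator at zero. */
static inline v128_t relu_f32x4(v128_t vacc) {
  const v128_t vzero = wasm_f32x4_splat(0.0f);
  return wasm_f32x4_max(vacc, vzero);  /* lane-wise max(acc, 0) */
}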
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_GEMM_RELU_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-gemm-relu.yaml b/test/f32-gemm-relu.yaml
index 3cd8a35..fe62cb5 100644
--- a/test/f32-gemm-relu.yaml
+++ b/test/f32-gemm-relu.yaml
@@ -10,6 +10,10 @@
k-block: 4
- name: xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat_x86
k-block: 4
+- name: xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_arm
+ k-block: 4
+- name: xnn_f32_gemm_relu_ukernel_4x2c4__wasmsimd_x86
+ k-block: 4
- name: xnn_f32_gemm_relu_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_gemm_relu_ukernel_2x4__wasm
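test/f32-gemm-relu.cc above is generated from this spec rather than edited by hand; each yaml entry names a kernel and its k-block, the K unrolling the test generator uses to pick its k sweeps. Read off the cases above, k-block: 4 gives k_eq (k = 4), k_lt (1-3), k_gt (5-7), k_div (8-40 in steps of 4), and k <= 20 in steps of 5 for the n sweeps. A sketch of that mapping, inferred from the generated tests rather than taken from the generator itself:

#include <stddef.h>

/* Inferred k sweeps for k_block = 4; the bounds reproduce the loops in the
   generated tests (k_lt, k_gt, k_div, and the k values of the n sweeps). */
static void sweep_k(size_t k_block) {
  for (size_t k = 1; k < k_block; k++) { /* k_lt */ }
  for (size_t k = k_block + 1; k < 2 * k_block; k++) { /* k_gt */ }
  for (size_t k = 2 * k_block; k <= 10 * k_block; k += k_block) { /* k_div */ }
  for (size_t k = 1; k <= 5 * k_block; k += k_block + 1) { /* n sweeps */ }
}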
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index b77544e..8754f7e 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -1634,6 +1634,409 @@
#endif // XNN_ARCH_WASMSIMD
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_eq_4_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_lt_4_strided_a) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_gt_4_strided_a) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_div_4_strided_a) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_gt_2_strided_a) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_div_2_strided_a) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X2C4__WASMSIMD, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
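The linear-activation 4x2c4 GEMM gets the same coverage minus the activation-specific cases, and it is the variant with no _arm/_x86 split. The strided_a cases pass a_stride values (7, 11, 23, 43) larger than the upper bound on k in the corresponding loop, so the tester hands the kernel an A matrix whose rows are not packed back to back. Independently of a_stride, the c4 blocking means the kernel consumes K in groups of four and handles a 1-3 element tail separately, which is exactly what the k_lt_4 and k_gt_4 cases probe. A hypothetical helper showing that split:

#include <stddef.h>

/* Hypothetical: full 4-element K groups and leftover elements a 4x2c4
   kernel sees for a given k, e.g. k = 7 -> 1 group + 3 tail elements. */
static void split_k(size_t k, size_t *full_groups, size_t *tail) {
  *full_groups = k / 4;
  *tail = k % 4;
}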
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_GEMM_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index 44d0810..1221830 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -11,6 +11,8 @@
k-block: 4
- name: xnn_f32_gemm_ukernel_5x8__wasmsimd_splat
k-block: 4
+- name: xnn_f32_gemm_ukernel_4x2c4__wasmsimd
+ k-block: 4
- name: xnn_f32_gemm_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_gemm_ukernel_2x4__wasm
diff --git a/test/f32-igemm-minmax.cc b/test/f32-igemm-minmax.cc
index fc5e150..78627df 100644
--- a/test/f32-igemm-minmax.cc
+++ b/test/f32-igemm-minmax.cc
@@ -57912,6 +57912,888 @@
#endif // XNN_ARCH_WASMSIMD
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, small_kernel) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, small_kernel_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_small_kernel) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_small_kernel) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, a_offset) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, zero) {
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, qmin) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, qmax) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, small_kernel) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, small_kernel_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_small_kernel) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_small_kernel) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, a_offset) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, zero) {
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, qmin) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, qmax) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_IGEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
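Relative to the GEMM suites, the IGEMM cases add the indirection-specific knobs. ks(3) sets the kernel size, i.e. how many A pointers the indirection buffer supplies per output row; a_offset(83) checks that a constant offset applied to those pointers is honored; zero_index(mz) points one row at the shared zero buffer, which is how an indirect GEMM is fed padding; and qmin(128)/qmax(128) tighten the output clamp of the MINMAX activation. A hypothetical sketch of the addressing idea only, not the XNNPACK kernel interface (whether the offset counts bytes or elements is not shown in this diff):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical: resolve one indirection tap. A pointer equal to the zero
   buffer is used as-is; every other pointer gets the offset applied. */
static const float *igemm_tap(const float *const *a, size_t tap,
                              size_t a_offset, const float *zero_buf) {
  const float *p = a[tap];
  return (p == zero_buf) ? p : (const float *) ((uintptr_t) p + a_offset);
}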
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_IGEMM_MINMAX_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-igemm-minmax.yaml b/test/f32-igemm-minmax.yaml
index 262d3af..8274224 100644
--- a/test/f32-igemm-minmax.yaml
+++ b/test/f32-igemm-minmax.yaml
@@ -320,6 +320,10 @@
k-block: 4
- name: xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86
k-block: 4
+- name: xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm
+ k-block: 4
+- name: xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86
+ k-block: 4
- name: xnn_f32_igemm_minmax_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_igemm_minmax_ukernel_2x4__wasm
diff --git a/test/f32-igemm-relu.cc b/test/f32-igemm-relu.cc
index e6b5f0d..53e2795 100644
--- a/test/f32-igemm-relu.cc
+++ b/test/f32-igemm-relu.cc
@@ -1682,6 +1682,836 @@
#endif // XNN_ARCH_WASMSIMD
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, small_kernel) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, small_kernel_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_gt_2_small_kernel) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, n_div_2_small_kernel) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, a_offset) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, zero) {
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_ARM, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, small_kernel) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, small_kernel_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_gt_2_small_kernel) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, n_div_2_small_kernel) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, a_offset) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, zero) {
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_RELU_4X2C4__WASMSIMD_X86, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_IGEMM_RELU_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-igemm-relu.yaml b/test/f32-igemm-relu.yaml
index 73ecd41..c4aa5b7 100644
--- a/test/f32-igemm-relu.yaml
+++ b/test/f32-igemm-relu.yaml
@@ -10,6 +10,10 @@
k-block: 4
- name: xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat_x86
k-block: 4
+- name: xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_arm
+ k-block: 4
+- name: xnn_f32_igemm_relu_ukernel_4x2c4__wasmsimd_x86
+ k-block: 4
- name: xnn_f32_igemm_relu_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_igemm_relu_ukernel_2x4__wasm
diff --git a/test/f32-igemm.cc b/test/f32-igemm.cc
index b44e649..7376243 100644
--- a/test/f32-igemm.cc
+++ b/test/f32-igemm.cc
@@ -1267,6 +1267,421 @@
#endif // XNN_ARCH_WASMSIMD
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_eq_4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_eq_4_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_eq_4_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(2)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_eq_4_subtile_n) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_lt_4) {
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_lt_4_subtile) {
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_gt_4) {
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_gt_4_subtile) {
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_div_4) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, k_div_4_subtile) {
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_gt_2) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_gt_2_strided_cn) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_gt_2_subtile) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_div_2) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_div_2_strided_cn) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(5)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_div_2_subtile) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, small_kernel) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, small_kernel_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_gt_2_small_kernel) {
+ for (uint32_t n = 3; n < 4; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, n_div_2_small_kernel) {
+ for (uint32_t n = 4; n <= 6; n += 2) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, strided_cm_subtile) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 2; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(5)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, a_offset) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, zero) {
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X2C4__WASMSIMD, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(2)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(2)
+ .k(4)
+ .cm_stride(5)
+ .Test(xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
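Every suite above also carries strided_cm/strided_cn cases, which cover the two output strides the microkernels take: cm_stride is the distance between consecutive rows of C, and cn_stride is how far the output pointer advances when the caller moves on to the next nr-wide column block. Both are set to 5 here, larger than nr = 2, so a kernel that silently assumes a densely packed C would fail. The addressing this implies, as an illustrative sketch with element-sized strides assumed:

#include <stddef.h>

/* Hypothetical: address of the nr-wide output block for row m and column
   block b, given a row stride and a column-block stride in elements. */
static float *c_block_ptr(float *c, size_t m, size_t b,
                          size_t cm_stride, size_t cn_stride) {
  return c + m * cm_stride + b * cn_stride;
}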
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_IGEMM_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-igemm.yaml b/test/f32-igemm.yaml
index 75f41ba..3de1001 100644
--- a/test/f32-igemm.yaml
+++ b/test/f32-igemm.yaml
@@ -8,6 +8,8 @@
k-block: 4
- name: xnn_f32_igemm_ukernel_5x8__wasmsimd_splat
k-block: 4
+- name: xnn_f32_igemm_ukernel_4x2c4__wasmsimd
+ k-block: 4
- name: xnn_f32_igemm_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_igemm_ukernel_2x4__wasm