WAsm SIMD versions of VMULCADDC microkernels

PiperOrigin-RevId: 321384656
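
This change adds WAsm SIMD implementations of the F32 VMULCADDC microkernels in two
tunings (generic/ARM-scheduled and x86-scheduled) at channel tiles 4 and 8, each
processing 2 rows per pass, and switches the WAsm SIMD build to them in place of the
psimd kernel. For orientation, a minimal scalar sketch of the operation these kernels
compute (names and the element-denominated strides here are illustrative only; the
real kernels take byte-denominated channel counts and strides, and read weights from
a packed layout described below):

  #include <stddef.h>

  // out[m][c] = clamp(in[m][c] * scale[c] + bias[c], min, max)
  static void vmulcaddc_ref(
      size_t rows, size_t channels,
      const float* input, size_t input_stride,   // strides in elements here
      const float* scale, const float* bias,
      float* output, size_t output_stride,
      float min, float max)
  {
    for (size_t m = 0; m < rows; m++) {
      for (size_t c = 0; c < channels; c++) {
        float v = input[m * input_stride + c] * scale[c] + bias[c];
        v = v < min ? min : v;
        v = v > max ? max : v;
        output[m * output_stride + c] = v;
      }
    }
  }
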
diff --git a/BUILD.bazel b/BUILD.bazel
index fe66542..39b1d4c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -744,6 +744,10 @@
     "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c",
     "src/f32-vbinary/gen/vsubc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsubc-wasmsimd-x8.c",
+    "src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-arm-2x.c",
+    "src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-arm-2x.c",
+    "src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-x86-2x.c",
+    "src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-x86-2x.c",
     "src/f32-vrnd/gen/vrndne-wasmsimd-x4.c",
     "src/f32-vrnd/gen/vrndne-wasmsimd-x8.c",
     "src/f32-vrnd/gen/vrndz-wasmsimd-x4.c",
diff --git a/scripts/generate-f32-vmulcaddc.sh b/scripts/generate-f32-vmulcaddc.sh
index 7067582..ffebb1e 100755
--- a/scripts/generate-f32-vmulcaddc.sh
+++ b/scripts/generate-f32-vmulcaddc.sh
@@ -15,6 +15,13 @@
 tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=2 -D ROW_TILE=2 -D WASM=1 -o src/f32-vmulcaddc/gen/c2-minmax-wasm-2x.c
 tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D WASM=1 -o src/f32-vmulcaddc/gen/c4-minmax-wasm-2x.c
 
+################################## WAsm SIMD ##################################
+tools/xngen src/f32-vmulcaddc/wasmsimd.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D X86=0 -o src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-arm-2x.c
+tools/xngen src/f32-vmulcaddc/wasmsimd.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D X86=0 -o src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-arm-2x.c
+
+tools/xngen src/f32-vmulcaddc/wasmsimd.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D X86=1 -o src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-x86-2x.c
+tools/xngen src/f32-vmulcaddc/wasmsimd.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D X86=1 -o src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-x86-2x.c
+
 ################################### ARM NEON ##################################
 tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D FMA=0 -o src/f32-vmulcaddc/gen/c4-minmax-neon-2x.c
 tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D FMA=0 -o src/f32-vmulcaddc/gen/c8-minmax-neon-2x.c
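
A note on the template knobs used above: CHANNEL_TILE is the number of floats
processed per main-loop iteration (a multiple of 4, the f32 lane count of a 128-bit
WAsm vector), ROW_TILE is the number of rows handled per pass, and X86 selects how
the output clamp is lowered (see the note after the two c4 variants below).
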
diff --git a/src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-arm-2x.c b/src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-arm-2x.c
new file mode 100644
index 0000000..57cf448
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-arm-2x.c
@@ -0,0 +1,119 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x(
+    size_t rows,
+    size_t channels,
+    const float*restrict input,
+    size_t input_stride,
+    const float*restrict weights,
+    float*restrict output,
+    size_t output_stride,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const size_t input_increment = input_stride * 2 - channels;
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
+      const v128_t vscale0123 = wasm_v128_load(w);
+
+      v128_t vacc0x0123 = wasm_v128_load(i0);
+      i0 += 4;
+      v128_t vacc1x0123 = wasm_v128_load(i1);
+      i1 += 4;
+
+      const v128_t vbias0123 = wasm_v128_load(w + 4);
+
+      vacc0x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc0x0123));
+      vacc1x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc1x0123));
+
+      vacc0x0123 = wasm_f32x4_max(vacc0x0123, vmin);
+      vacc1x0123 = wasm_f32x4_max(vacc1x0123, vmin);
+
+      vacc0x0123 = wasm_f32x4_min(vacc0x0123, vmax);
+      vacc1x0123 = wasm_f32x4_min(vacc1x0123, vmax);
+
+      wasm_v128_store(o0, vacc0x0123);
+      o0 += 4;
+      wasm_v128_store(o1, vacc1x0123);
+      o1 += 4;
+
+      w += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      v128_t vacc0 = wasm_v128_load(i0);
+      i0 = (const float*) ((uintptr_t) i0 + c);
+      v128_t vacc1 = wasm_v128_load(i1);
+      i1 = (const float*) ((uintptr_t) i1 + c);
+
+      const v128_t vbias = wasm_v128_load(w + 4);
+
+      vacc0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc0));
+      vacc1 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc1));
+
+      vacc0 = wasm_f32x4_max(vacc0, vmin);
+      vacc1 = wasm_f32x4_max(vacc1, vmin);
+
+      vacc0 = wasm_f32x4_min(vacc0, vmax);
+      vacc1 = wasm_f32x4_min(vacc1, vmax);
+
+      if (c & (2 * sizeof(float))) {
+        *((double*) o0) = wasm_f64x2_extract_lane(vacc0, 0);
+        *((double*) o1) = wasm_f64x2_extract_lane(vacc1, 0);
+
+        vacc0 = wasm_v32x4_shuffle(vacc0, vacc0, 2, 3, 2, 3);
+        vacc1 = wasm_v32x4_shuffle(vacc1, vacc1, 2, 3, 2, 3);
+
+        o0 += 2;
+        o1 += 2;
+      }
+      if (c & (1 * sizeof(float))) {
+        *o0++ = wasm_f32x4_extract_lane(vacc0, 0);
+        *o1++ = wasm_f32x4_extract_lane(vacc1, 0);
+      }
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);
+  } while (rows != 0);
+}
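
Two details of the row loop above are worth calling out. First, whenever fewer than
two rows remain for the next pass (the rows < 2 check up front, rows < 4 at the
bottom of the loop), i1/o1 are aliased to i0/o0, so the second "row" redundantly
recomputes row 0 into the same output rather than branching to a separate 1-row
path. Second, doz comes from <xnnpack/math.h> and is saturating ("difference or
zero") subtraction, so rows never underflows; a sketch of its presumed behavior:

  // doz(a, b) = a - b, clamped at zero.
  static inline size_t doz(size_t a, size_t b) {
    return a >= b ? a - b : 0;
  }
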
diff --git a/src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-x86-2x.c b/src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-x86-2x.c
new file mode 100644
index 0000000..1a32dd4
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c4-minmax-wasmsimd-x86-2x.c
@@ -0,0 +1,119 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x(
+    size_t rows,
+    size_t channels,
+    const float*restrict input,
+    size_t input_stride,
+    const float*restrict weights,
+    float*restrict output,
+    size_t output_stride,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const size_t input_increment = input_stride * 2 - channels;
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
+      const v128_t vscale0123 = wasm_v128_load(w);
+
+      v128_t vacc0x0123 = wasm_v128_load(i0);
+      i0 += 4;
+      v128_t vacc1x0123 = wasm_v128_load(i1);
+      i1 += 4;
+
+      const v128_t vbias0123 = wasm_v128_load(w + 4);
+
+      vacc0x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc0x0123));
+      vacc1x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc1x0123));
+
+      vacc0x0123 = wasm_v128_bitselect(vmin, vacc0x0123, wasm_f32x4_lt(vacc0x0123, vmin));
+      vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
+
+      vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vmax, wasm_f32x4_le(vacc0x0123, vmax));
+      vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
+
+      wasm_v128_store(o0, vacc0x0123);
+      o0 += 4;
+      wasm_v128_store(o1, vacc1x0123);
+      o1 += 4;
+
+      w += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      v128_t vacc0 = wasm_v128_load(i0);
+      i0 = (const float*) ((uintptr_t) i0 + c);
+      v128_t vacc1 = wasm_v128_load(i1);
+      i1 = (const float*) ((uintptr_t) i1 + c);
+
+      const v128_t vbias = wasm_v128_load(w + 4);
+
+      vacc0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc0));
+      vacc1 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc1));
+
+      vacc0 = wasm_v128_bitselect(vmin, vacc0, wasm_f32x4_lt(vacc0, vmin));
+      vacc1 = wasm_v128_bitselect(vmin, vacc1, wasm_f32x4_lt(vacc1, vmin));
+
+      vacc0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
+      vacc1 = wasm_v128_bitselect(vacc1, vmax, wasm_f32x4_le(vacc1, vmax));
+
+      if (c & (2 * sizeof(float))) {
+        *((double*) o0) = wasm_f64x2_extract_lane(vacc0, 0);
+        *((double*) o1) = wasm_f64x2_extract_lane(vacc1, 0);
+
+        vacc0 = wasm_v32x4_shuffle(vacc0, vacc0, 2, 3, 2, 3);
+        vacc1 = wasm_v32x4_shuffle(vacc1, vacc1, 2, 3, 2, 3);
+
+        o0 += 2;
+        o1 += 2;
+      }
+      if (c & (1 * sizeof(float))) {
+        *o0++ = wasm_f32x4_extract_lane(vacc0, 0);
+        *o1++ = wasm_f32x4_extract_lane(vacc1, 0);
+      }
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);
+  } while (rows != 0);
+}
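
The c4 ARM and x86 variants differ only in how the [min, max] clamp is lowered. The
usual rationale (an assumption about the tuning, not stated in this change) is that
WAsm f32x4.min/max map to single instructions on ARM but, because of their NaN and
signed-zero semantics, lower to multi-instruction sequences on x86, where a compare
plus lane-select is cheaper. Both forms compute the same clamp for the values these
kernels produce:

  #include <wasm_simd128.h>

  // ARM-tuned form: plain f32x4 min/max.
  static inline v128_t clamp_arm(v128_t v, v128_t vmin, v128_t vmax) {
    v = wasm_f32x4_max(v, vmin);
    return wasm_f32x4_min(v, vmax);
  }

  // x86-tuned form: compare + bitselect (select vmin where v < vmin, etc.).
  static inline v128_t clamp_x86(v128_t v, v128_t vmin, v128_t vmax) {
    v = wasm_v128_bitselect(vmin, v, wasm_f32x4_lt(v, vmin));
    return wasm_v128_bitselect(v, vmax, wasm_f32x4_le(v, vmax));
  }
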
diff --git a/src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-arm-2x.c b/src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-arm-2x.c
new file mode 100644
index 0000000..a052cfc
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-arm-2x.c
@@ -0,0 +1,157 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x(
+    size_t rows,
+    size_t channels,
+    const float*restrict input,
+    size_t input_stride,
+    const float*restrict weights,
+    float*restrict output,
+    size_t output_stride,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const size_t input_increment = input_stride * 2 - channels;
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
+      const v128_t vscale0123 = wasm_v128_load(w);
+      const v128_t vscale4567 = wasm_v128_load(w + 4);
+
+      v128_t vacc0x0123 = wasm_v128_load(i0);
+      v128_t vacc0x4567 = wasm_v128_load(i0 + 4);
+      i0 += 8;
+      v128_t vacc1x0123 = wasm_v128_load(i1);
+      v128_t vacc1x4567 = wasm_v128_load(i1 + 4);
+      i1 += 8;
+
+      const v128_t vbias0123 = wasm_v128_load(w + 8);
+      const v128_t vbias4567 = wasm_v128_load(w + 12);
+
+      vacc0x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc0x0123));
+      vacc0x4567 = wasm_f32x4_add(vbias4567, wasm_f32x4_mul(vscale4567, vacc0x4567));
+      vacc1x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc1x0123));
+      vacc1x4567 = wasm_f32x4_add(vbias4567, wasm_f32x4_mul(vscale4567, vacc1x4567));
+
+      vacc0x0123 = wasm_f32x4_max(vacc0x0123, vmin);
+      vacc0x4567 = wasm_f32x4_max(vacc0x4567, vmin);
+      vacc1x0123 = wasm_f32x4_max(vacc1x0123, vmin);
+      vacc1x4567 = wasm_f32x4_max(vacc1x4567, vmin);
+
+      vacc0x0123 = wasm_f32x4_min(vacc0x0123, vmax);
+      vacc0x4567 = wasm_f32x4_min(vacc0x4567, vmax);
+      vacc1x0123 = wasm_f32x4_min(vacc1x0123, vmax);
+      vacc1x4567 = wasm_f32x4_min(vacc1x4567, vmax);
+
+      wasm_v128_store(o0, vacc0x0123);
+      wasm_v128_store(o0 + 4, vacc0x4567);
+      o0 += 8;
+      wasm_v128_store(o1, vacc1x0123);
+      wasm_v128_store(o1 + 4, vacc1x4567);
+      o1 += 8;
+
+      w += 16;
+    }
+    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      v128_t vacc0 = wasm_v128_load(i0);
+      i0 += 4;
+      v128_t vacc1 = wasm_v128_load(i1);
+      i1 += 4;
+
+      const v128_t vbias = wasm_v128_load(w + 8);
+
+      vacc0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc0));
+      vacc1 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc1));
+
+      vacc0 = wasm_f32x4_max(vacc0, vmin);
+      vacc1 = wasm_f32x4_max(vacc1, vmin);
+
+      vacc0 = wasm_f32x4_min(vacc0, vmax);
+      vacc1 = wasm_f32x4_min(vacc1, vmax);
+
+      wasm_v128_store(o0, vacc0);
+      o0 += 4;
+      wasm_v128_store(o1, vacc1);
+      o1 += 4;
+
+      w += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      v128_t vacc0 = wasm_v128_load(i0);
+      i0 = (const float*) ((uintptr_t) i0 + c);
+      v128_t vacc1 = wasm_v128_load(i1);
+      i1 = (const float*) ((uintptr_t) i1 + c);
+
+      const v128_t vbias = wasm_v128_load(w + 8);
+
+      vacc0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc0));
+      vacc1 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc1));
+
+      vacc0 = wasm_f32x4_max(vacc0, vmin);
+      vacc1 = wasm_f32x4_max(vacc1, vmin);
+
+      vacc0 = wasm_f32x4_min(vacc0, vmax);
+      vacc1 = wasm_f32x4_min(vacc1, vmax);
+
+      if (c & (2 * sizeof(float))) {
+        *((double*) o0) = wasm_f64x2_extract_lane(vacc0, 0);
+        *((double*) o1) = wasm_f64x2_extract_lane(vacc1, 0);
+
+        vacc0 = wasm_v32x4_shuffle(vacc0, vacc0, 2, 3, 2, 3);
+        vacc1 = wasm_v32x4_shuffle(vacc1, vacc1, 2, 3, 2, 3);
+
+        o0 += 2;
+        o1 += 2;
+      }
+      if (c & (1 * sizeof(float))) {
+        *o0++ = wasm_f32x4_extract_lane(vacc0, 0);
+        *o1++ = wasm_f32x4_extract_lane(vacc1, 0);
+      }
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);
+  } while (rows != 0);
+}
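
The weight pointer arithmetic in the c8 kernel implies a packed layout of full
CHANNEL_TILE-sized groups, each holding a scale tile followed by a bias tile: the
main loop loads biases at w + 8 and advances w by 16, and because the 4-wide
remainder loop only advances w by 4, its bias load at w + 8 keeps pointing into the
bias half of the same group. A hypothetical packing helper (not part of this change)
that produces this layout:

  // Packs per-channel scale/bias into [scale[0..7] | bias[0..7]] groups.
  // Assumes channels has already been padded up to a multiple of 8.
  static void pack_vmulcaddc_weights_c8(
      size_t channels, const float* scale, const float* bias, float* packed)
  {
    for (size_t c = 0; c < channels; c += 8) {
      for (size_t i = 0; i < 8; i++) packed[i] = scale[c + i];
      for (size_t i = 0; i < 8; i++) packed[8 + i] = bias[c + i];
      packed += 16;
    }
  }
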
diff --git a/src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-x86-2x.c b/src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-x86-2x.c
new file mode 100644
index 0000000..0e855e8
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c8-minmax-wasmsimd-x86-2x.c
@@ -0,0 +1,157 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x(
+    size_t rows,
+    size_t channels,
+    const float*restrict input,
+    size_t input_stride,
+    const float*restrict weights,
+    float*restrict output,
+    size_t output_stride,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const size_t input_increment = input_stride * 2 - channels;
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
+      const v128_t vscale0123 = wasm_v128_load(w);
+      const v128_t vscale4567 = wasm_v128_load(w + 4);
+
+      v128_t vacc0x0123 = wasm_v128_load(i0);
+      v128_t vacc0x4567 = wasm_v128_load(i0 + 4);
+      i0 += 8;
+      v128_t vacc1x0123 = wasm_v128_load(i1);
+      v128_t vacc1x4567 = wasm_v128_load(i1 + 4);
+      i1 += 8;
+
+      const v128_t vbias0123 = wasm_v128_load(w + 8);
+      const v128_t vbias4567 = wasm_v128_load(w + 12);
+
+      vacc0x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc0x0123));
+      vacc0x4567 = wasm_f32x4_add(vbias4567, wasm_f32x4_mul(vscale4567, vacc0x4567));
+      vacc1x0123 = wasm_f32x4_add(vbias0123, wasm_f32x4_mul(vscale0123, vacc1x0123));
+      vacc1x4567 = wasm_f32x4_add(vbias4567, wasm_f32x4_mul(vscale4567, vacc1x4567));
+
+      vacc0x0123 = wasm_v128_bitselect(vmin, vacc0x0123, wasm_f32x4_lt(vacc0x0123, vmin));
+      vacc0x4567 = wasm_v128_bitselect(vmin, vacc0x4567, wasm_f32x4_lt(vacc0x4567, vmin));
+      vacc1x0123 = wasm_v128_bitselect(vmin, vacc1x0123, wasm_f32x4_lt(vacc1x0123, vmin));
+      vacc1x4567 = wasm_v128_bitselect(vmin, vacc1x4567, wasm_f32x4_lt(vacc1x4567, vmin));
+
+      vacc0x0123 = wasm_v128_bitselect(vacc0x0123, vmax, wasm_f32x4_le(vacc0x0123, vmax));
+      vacc0x4567 = wasm_v128_bitselect(vacc0x4567, vmax, wasm_f32x4_le(vacc0x4567, vmax));
+      vacc1x0123 = wasm_v128_bitselect(vacc1x0123, vmax, wasm_f32x4_le(vacc1x0123, vmax));
+      vacc1x4567 = wasm_v128_bitselect(vacc1x4567, vmax, wasm_f32x4_le(vacc1x4567, vmax));
+
+      wasm_v128_store(o0, vacc0x0123);
+      wasm_v128_store(o0 + 4, vacc0x4567);
+      o0 += 8;
+      wasm_v128_store(o1, vacc1x0123);
+      wasm_v128_store(o1 + 4, vacc1x4567);
+      o1 += 8;
+
+      w += 16;
+    }
+    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      v128_t vacc0 = wasm_v128_load(i0);
+      i0 += 4;
+      v128_t vacc1 = wasm_v128_load(i1);
+      i1 += 4;
+
+      const v128_t vbias = wasm_v128_load(w + 8);
+
+      vacc0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc0));
+      vacc1 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc1));
+
+      vacc0 = wasm_v128_bitselect(vmin, vacc0, wasm_f32x4_lt(vacc0, vmin));
+      vacc1 = wasm_v128_bitselect(vmin, vacc1, wasm_f32x4_lt(vacc1, vmin));
+
+      vacc0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
+      vacc1 = wasm_v128_bitselect(vacc1, vmax, wasm_f32x4_le(vacc1, vmax));
+
+      wasm_v128_store(o0, vacc0);
+      o0 += 4;
+      wasm_v128_store(o1, vacc1);
+      o1 += 4;
+
+      w += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      v128_t vacc0 = wasm_v128_load(i0);
+      i0 = (const float*) ((uintptr_t) i0 + c);
+      v128_t vacc1 = wasm_v128_load(i1);
+      i1 = (const float*) ((uintptr_t) i1 + c);
+
+      const v128_t vbias = wasm_v128_load(w + 8);
+
+      vacc0 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc0));
+      vacc1 = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc1));
+
+      vacc0 = wasm_v128_bitselect(vmin, vacc0, wasm_f32x4_lt(vacc0, vmin));
+      vacc1 = wasm_v128_bitselect(vmin, vacc1, wasm_f32x4_lt(vacc1, vmin));
+
+      vacc0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
+      vacc1 = wasm_v128_bitselect(vacc1, vmax, wasm_f32x4_le(vacc1, vmax));
+
+      if (c & (2 * sizeof(float))) {
+        *((double*) o0) = wasm_f64x2_extract_lane(vacc0, 0);
+        *((double*) o1) = wasm_f64x2_extract_lane(vacc1, 0);
+
+        vacc0 = wasm_v32x4_shuffle(vacc0, vacc0, 2, 3, 2, 3);
+        vacc1 = wasm_v32x4_shuffle(vacc1, vacc1, 2, 3, 2, 3);
+
+        o0 += 2;
+        o1 += 2;
+      }
+      if (c & (1 * sizeof(float))) {
+        *o0++ = wasm_f32x4_extract_lane(vacc0, 0);
+        *o1++ = wasm_f32x4_extract_lane(vacc1, 0);
+      }
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);
+  } while (rows != 0);
+}
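
The remainder path in all four kernels performs a full 16-byte load even when fewer
than 4 channels remain, which presumes the caller guarantees readable bytes past the
last channel (an assumption typical of XNNPACK buffers, not something this diff
itself establishes). The tail store then narrows: a 64-bit store for a pair of
lanes, a shuffle to move lanes 2,3 down, and a 32-bit store for a leftover lane.
The same logic, pulled out as a standalone sketch:

  #include <wasm_simd128.h>

  // Stores the first n (0 < n < 4) f32 lanes of v to o.
  static void store_partial_f32x4(float* o, v128_t v, size_t n) {
    if (n & 2) {
      *((double*) o) = wasm_f64x2_extract_lane(v, 0);  // lanes 0,1 as one 64-bit store
      v = wasm_v32x4_shuffle(v, v, 2, 3, 2, 3);        // move lanes 2,3 down
      o += 2;
    }
    if (n & 1) {
      *o = wasm_f32x4_extract_lane(v, 0);
    }
  }
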
diff --git a/src/f32-vmulcaddc/wasmsimd.c.in b/src/f32-vmulcaddc/wasmsimd.c.in
new file mode 100644
index 0000000..3dbe710
--- /dev/null
+++ b/src/f32-vmulcaddc/wasmsimd.c.in
@@ -0,0 +1,186 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 4 == 0
+$assert CHANNEL_TILE >= 4
+$assert ROW_TILE >= 1
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c${CHANNEL_TILE}__wasmsimd_${"x86" if X86 else "arm"}_${ROW_TILE}x(
+    size_t rows,
+    size_t channels,
+    const float*restrict input,
+    size_t input_stride,
+    const float*restrict weights,
+    float*restrict output,
+    size_t output_stride,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  $for M in range(1, ROW_TILE):
+    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_stride);
+    float* o${M} = (float*) ((uintptr_t) o${M-1} + output_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(rows <= ${M}) {
+        i${M} = i${M-1};
+        o${M} = o${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(rows < ${M+1}) {
+        i${M} = i${M-1};
+        o${M} = o${M-1};
+      }
+
+  const size_t input_increment = input_stride * ${ROW_TILE} - channels;
+  const size_t output_increment = output_stride * ${ROW_TILE} - channels;
+
+  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    for (; c >= ${CHANNEL_TILE} * sizeof(float); c -= ${CHANNEL_TILE} * sizeof(float)) {
+      const v128_t vscale${ABC[0:4]} = wasm_v128_load(w);
+      $for C in range(4, CHANNEL_TILE, 4):
+        const v128_t vscale${ABC[C:C+4]} = wasm_v128_load(w + ${C});
+
+      $for M in range(ROW_TILE):
+        v128_t vacc${M}x${ABC[0:4]} = wasm_v128_load(i${M});
+        $for C in range(4, CHANNEL_TILE, 4):
+          v128_t vacc${M}x${ABC[C:C+4]} = wasm_v128_load(i${M} + ${C});
+        i${M} += ${CHANNEL_TILE};
+
+      $for C in range(0, CHANNEL_TILE, 4):
+        const v128_t vbias${ABC[C:C+4]} = wasm_v128_load(w + ${C + CHANNEL_TILE});
+
+      $for M in range(ROW_TILE):
+        $for C in range(0, CHANNEL_TILE, 4):
+          vacc${M}x${ABC[C:C+4]} = wasm_f32x4_add(vbias${ABC[C:C+4]}, wasm_f32x4_mul(vscale${ABC[C:C+4]}, vacc${M}x${ABC[C:C+4]}));
+
+      $if X86:
+        $for M in range(ROW_TILE):
+          $for C in range(0, CHANNEL_TILE, 4):
+            vacc${M}x${ABC[C:C+4]} = wasm_v128_bitselect(vmin, vacc${M}x${ABC[C:C+4]}, wasm_f32x4_lt(vacc${M}x${ABC[C:C+4]}, vmin));
+
+        $for M in range(ROW_TILE):
+          $for C in range(0, CHANNEL_TILE, 4):
+            vacc${M}x${ABC[C:C+4]} = wasm_v128_bitselect(vacc${M}x${ABC[C:C+4]}, vmax, wasm_f32x4_le(vacc${M}x${ABC[C:C+4]}, vmax));
+      $else:
+        $for M in range(ROW_TILE):
+          $for C in range(0, CHANNEL_TILE, 4):
+            vacc${M}x${ABC[C:C+4]} = wasm_f32x4_max(vacc${M}x${ABC[C:C+4]}, vmin);
+
+        $for M in range(ROW_TILE):
+          $for C in range(0, CHANNEL_TILE, 4):
+            vacc${M}x${ABC[C:C+4]} = wasm_f32x4_min(vacc${M}x${ABC[C:C+4]}, vmax);
+
+      $for M in range(ROW_TILE):
+        wasm_v128_store(o${M}, vacc${M}x${ABC[0:4]});
+        $for C in range(4, CHANNEL_TILE, 4):
+          wasm_v128_store(o${M} + ${C}, vacc${M}x${ABC[C:C+4]});
+        o${M} += ${CHANNEL_TILE};
+
+      w += ${CHANNEL_TILE * 2};
+    }
+    $if CHANNEL_TILE > 4:
+      for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
+        const v128_t vscale = wasm_v128_load(w);
+
+        $for M in range(ROW_TILE):
+          v128_t vacc${M} = wasm_v128_load(i${M});
+          i${M} += 4;
+
+        const v128_t vbias = wasm_v128_load(w + ${CHANNEL_TILE});
+
+        $for M in range(ROW_TILE):
+          vacc${M} = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc${M}));
+
+        $if X86:
+          $for M in range(ROW_TILE):
+            vacc${M} = wasm_v128_bitselect(vmin, vacc${M}, wasm_f32x4_lt(vacc${M}, vmin));
+
+          $for M in range(ROW_TILE):
+            vacc${M} = wasm_v128_bitselect(vacc${M}, vmax, wasm_f32x4_le(vacc${M}, vmax));
+        $else:
+          $for M in range(ROW_TILE):
+            vacc${M} = wasm_f32x4_max(vacc${M}, vmin);
+
+          $for M in range(ROW_TILE):
+            vacc${M} = wasm_f32x4_min(vacc${M}, vmax);
+
+        $for M in range(ROW_TILE):
+          wasm_v128_store(o${M}, vacc${M});
+          o${M} += 4;
+
+        w += 4;
+      }
+    if XNN_UNLIKELY(c != 0) {
+      const v128_t vscale = wasm_v128_load(w);
+
+      $for M in range(ROW_TILE):
+        v128_t vacc${M} = wasm_v128_load(i${M});
+        i${M} = (const float*) ((uintptr_t) i${M} + c);
+
+      const v128_t vbias = wasm_v128_load(w + ${CHANNEL_TILE});
+
+      $for M in range(ROW_TILE):
+        vacc${M} = wasm_f32x4_add(vbias, wasm_f32x4_mul(vscale, vacc${M}));
+
+      $if X86:
+        $for M in range(ROW_TILE):
+          vacc${M} = wasm_v128_bitselect(vmin, vacc${M}, wasm_f32x4_lt(vacc${M}, vmin));
+
+        $for M in range(ROW_TILE):
+          vacc${M} = wasm_v128_bitselect(vacc${M}, vmax, wasm_f32x4_le(vacc${M}, vmax));
+      $else:
+        $for M in range(ROW_TILE):
+          vacc${M} = wasm_f32x4_max(vacc${M}, vmin);
+
+        $for M in range(ROW_TILE):
+          vacc${M} = wasm_f32x4_min(vacc${M}, vmax);
+
+      if (c & (2 * sizeof(float))) {
+        $for M in range(ROW_TILE):
+          *((double*) o${M}) = wasm_f64x2_extract_lane(vacc${M}, 0);
+
+        $for M in range(ROW_TILE):
+          vacc${M} = wasm_v32x4_shuffle(vacc${M}, vacc${M}, 2, 3, 2, 3);
+
+        $for M in range(ROW_TILE):
+          o${M} += 2;
+      }
+      if (c & (1 * sizeof(float))) {
+        $for M in range(ROW_TILE):
+          *o${M}++ = wasm_f32x4_extract_lane(vacc${M}, 0);
+      }
+    }
+    $for M in range(ROW_TILE):
+      i${M} = (const float*) ((uintptr_t) i${M} + input_increment);
+      o${M} = (float*) ((uintptr_t) o${M} + output_increment);
+      $if M % 2 == 1:
+        if XNN_UNPREDICTABLE(rows < ${ROW_TILE + M + 1}) {
+          i${M} = i${M-1};
+          o${M} = o${M-1};
+        }
+      $elif M != 0:
+        if XNN_UNPREDICTABLE(rows <= ${ROW_TILE + M}) {
+          i${M} = i${M-1};
+          o${M} = o${M-1};
+        }
+    rows = doz(rows, ${ROW_TILE});
+  } while (rows != 0);
+}
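
The template's pointer setup generalizes the rows < 2 check seen in the generated
ROW_TILE=2 files: each trailing row pointer falls back to the previous row when not
enough rows remain, and the epilogue repeats the same clamping against ROW_TILE + M
before the next pass. Note that the rows <= M and rows < M+1 forms chosen by parity
are the same predicate, just spelled differently. Expanded by hand for a
hypothetical ROW_TILE=3 instantiation, the prologue would read:

  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = i0;
    o1 = o0;
  }
  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
  float* o2 = (float*) ((uintptr_t) o1 + output_stride);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = i1;
    o2 = o1;
  }
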
diff --git a/src/init.c b/src/init.c
index d932e33..0f5da5a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1969,11 +1969,19 @@
       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x8,
       .element_tile = 8,
     };
-    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
-      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__psimd_2x,
-      .channel_tile = 4,
-      .row_tile = 2,
-    };
+    if (is_wasm_x86) {
+      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
+        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x,
+        .channel_tile = 4,
+        .row_tile = 2,
+      };
+    } else {
+      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
+        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x,
+        .channel_tile = 4,
+        .row_tile = 2,
+      };
+    }
     #ifndef XNN_NO_NCHW_OPERATORS
       xnn_params.f32.spmm = (struct spmm_parameters) {
         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_16x1__psimd,
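
With this, the WAsm SIMD build selects the vmulcaddc kernel at initialization using
the is_wasm_x86 flag already computed in init.c, instead of unconditionally
installing the psimd kernel. channel_tile and row_tile are unchanged, so
operator-level blocking is unaffected; only the kernel body differs. A sketch of the
call shape the operator layer feeds these entries (argument names are illustrative;
sizes and strides are byte-denominated, per the kernel asserts above):

  xnn_params.f32.vmulcaddc.ukernel(
      batch_rows,
      channels * sizeof(float),
      input,  input_stride * sizeof(float),
      packed_weights,
      output, output_stride * sizeof(float),
      &minmax_params);
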
diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
index e227b54..39b4b6e 100644
--- a/src/xnnpack/vmulcaddc.h
+++ b/src/xnnpack/vmulcaddc.h
@@ -33,11 +33,17 @@
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neonfma_2x)
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__neonfma_2x)
 
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__sse_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__sse_2x)
+
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__psimd_2x)
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__psimd_2x)
 
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__sse_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__sse_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x)
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x)
 
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c1__wasm_2x)
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c2__wasm_2x)
diff --git a/test/f32-vmulcaddc-minmax.cc b/test/f32-vmulcaddc-minmax.cc
index 5f08664..24c663c 100644
--- a/test/f32-vmulcaddc-minmax.cc
+++ b/test/f32-vmulcaddc-minmax.cc
@@ -1249,6 +1249,574 @@
 #endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, channels_eq_4) {
+    VMulCAddCMicrokernelTester()
+      .channel_tile(4)
+      .channels(4)
+      .rows(2)
+      .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, channels_div_4) {
+    for (size_t channels = 8; channels < 40; channels += 4) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(4)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, channels_lt_4) {
+    for (size_t channels = 1; channels < 4; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(4)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, channels_gt_4) {
+    for (size_t channels = 5; channels < 8; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(4)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, rows_lt_2) {
+    for (size_t rows = 1; rows < 2; rows++) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, rows_div_2) {
+    for (size_t rows = 4; rows <= 8; rows += 2) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, rows_gt_2) {
+    for (size_t rows = 3; rows < 4; rows++) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, input_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .input_stride(23)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, output_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .output_stride(23)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, inplace) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .inplace(true)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, qmin) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .qmin(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_ARM_2X, qmax) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .qmax(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, channels_eq_8) {
+    VMulCAddCMicrokernelTester()
+      .channel_tile(8)
+      .channels(8)
+      .rows(2)
+      .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, channels_div_8) {
+    for (size_t channels = 16; channels < 80; channels += 8) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(8)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, channels_lt_8) {
+    for (size_t channels = 1; channels < 8; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(8)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, channels_gt_8) {
+    for (size_t channels = 9; channels < 16; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(8)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, rows_lt_2) {
+    for (size_t rows = 1; rows < 2; rows++) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, rows_div_2) {
+    for (size_t rows = 4; rows <= 8; rows += 2) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, rows_gt_2) {
+    for (size_t rows = 3; rows < 4; rows++) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, input_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .input_stride(43)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, output_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .output_stride(43)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, inplace) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .inplace(true)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, qmin) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .qmin(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_ARM_2X, qmax) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .qmax(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, channels_eq_4) {
+    VMulCAddCMicrokernelTester()
+      .channel_tile(4)
+      .channels(4)
+      .rows(2)
+      .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, channels_div_4) {
+    for (size_t channels = 8; channels < 40; channels += 4) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(4)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, channels_lt_4) {
+    for (size_t channels = 1; channels < 4; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(4)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, channels_gt_4) {
+    for (size_t channels = 5; channels < 8; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(4)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, rows_lt_2) {
+    for (size_t rows = 1; rows < 2; rows++) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, rows_div_2) {
+    for (size_t rows = 4; rows <= 8; rows += 2) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, rows_gt_2) {
+    for (size_t rows = 3; rows < 4; rows++) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, input_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .input_stride(23)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, output_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .output_stride(23)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, inplace) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .inplace(true)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, qmin) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .qmin(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C4__WASMSIMD_X86_2X, qmax) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 20; channels += 3) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(4)
+          .channels(channels)
+          .rows(rows)
+          .qmax(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, channels_eq_8) {
+    VMulCAddCMicrokernelTester()
+      .channel_tile(8)
+      .channels(8)
+      .rows(2)
+      .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, channels_div_8) {
+    for (size_t channels = 16; channels < 80; channels += 8) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(8)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, channels_lt_8) {
+    for (size_t channels = 1; channels < 8; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(8)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, channels_gt_8) {
+    for (size_t channels = 9; channels < 16; channels++) {
+      VMulCAddCMicrokernelTester()
+        .channel_tile(8)
+        .channels(channels)
+        .rows(2)
+        .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, rows_lt_2) {
+    for (size_t rows = 1; rows < 2; rows++) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, rows_div_2) {
+    for (size_t rows = 4; rows <= 8; rows += 2) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, rows_gt_2) {
+    for (size_t rows = 3; rows < 4; rows++) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, input_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .input_stride(43)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, output_stride) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .output_stride(43)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, inplace) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .inplace(true)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, qmin) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .qmin(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+
+  TEST(F32_VMULCADDC_C8__WASMSIMD_X86_2X, qmax) {
+    for (size_t rows = 1; rows <= 6; rows += 1) {
+      for (size_t channels = 1; channels <= 40; channels += 7) {
+        VMulCAddCMicrokernelTester()
+          .channel_tile(8)
+          .channels(channels)
+          .rows(rows)
+          .qmax(128)
+          .Test(xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMULCADDC_C1__WASM_2X, channels_eq_1) {
     VMulCAddCMicrokernelTester()
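
The new test suites mirror the existing per-kernel ones: exact-tile, sub-tile, and
super-tile channel counts (exercising the remainder and tail-store paths above), row
counts around the 2-row tile, non-contiguous input/output strides, in-place
operation, and qmin/qmax clamping, which covers the minmax params path that differs
between the ARM and x86 lowerings. The yaml change below registers the four new
kernel names with the test generator, presumably keeping the generated .cc and the
kernel list in sync.
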
diff --git a/test/f32-vmulcaddc-minmax.yaml b/test/f32-vmulcaddc-minmax.yaml
index 93fa59c..ffc5b79 100644
--- a/test/f32-vmulcaddc-minmax.yaml
+++ b/test/f32-vmulcaddc-minmax.yaml
@@ -10,6 +10,10 @@
 - name: xnn_f32_vmulcaddc_ukernel_c8__sse_2x
 - name: xnn_f32_vmulcaddc_ukernel_c4__psimd_2x
 - name: xnn_f32_vmulcaddc_ukernel_c8__psimd_2x
+- name: xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_arm_2x
+- name: xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_arm_2x
+- name: xnn_f32_vmulcaddc_ukernel_c4__wasmsimd_x86_2x
+- name: xnn_f32_vmulcaddc_ukernel_c8__wasmsimd_x86_2x
 - name: xnn_f32_vmulcaddc_ukernel_c1__wasm_2x
 - name: xnn_f32_vmulcaddc_ukernel_c2__wasm_2x
 - name: xnn_f32_vmulcaddc_ukernel_c4__wasm_2x