WAsm SIMD versions of the F32 CLAMP microkernel

PiperOrigin-RevId: 320246893
diff --git a/BUILD.bazel b/BUILD.bazel
index 89df855..80cf46c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -505,6 +505,10 @@
 ]
 
 WASMSIMD_UKERNELS = [
+    "src/f32-clamp/gen/wasmsimd-arm-x4.c",
+    "src/f32-clamp/gen/wasmsimd-arm-x8.c",
+    "src/f32-clamp/gen/wasmsimd-x86-x4.c",
+    "src/f32-clamp/gen/wasmsimd-x86-x8.c",
     "src/f32-dwconv/gen/up4x25-wasmsimd.c",
     "src/f32-dwconv/gen/up4x4-wasmsimd.c",
     "src/f32-dwconv/gen/up4x9-wasmsimd.c",
diff --git a/scripts/generate-f32-clamp.sh b/scripts/generate-f32-clamp.sh
index 863458c..f0d2c35 100755
--- a/scripts/generate-f32-clamp.sh
+++ b/scripts/generate-f32-clamp.sh
@@ -15,6 +15,13 @@
 tools/xngen src/f32-clamp/scalar.c.in -D BATCH_TILE=2 -D WASM=1 -o src/f32-clamp/gen/wasm-x2.c
 tools/xngen src/f32-clamp/scalar.c.in -D BATCH_TILE=4 -D WASM=1 -o src/f32-clamp/gen/wasm-x4.c
 
+################################## WAsm SIMD ##################################
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=4 -D X86=0 -o src/f32-clamp/gen/wasmsimd-arm-x4.c
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=8 -D X86=0 -o src/f32-clamp/gen/wasmsimd-arm-x8.c
+
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=4 -D X86=1 -o src/f32-clamp/gen/wasmsimd-x86-x4.c
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=8 -D X86=1 -o src/f32-clamp/gen/wasmsimd-x86-x8.c
+
 ################################### ARM NEON ##################################
 tools/xngen src/f32-clamp/neon.c.in -D BATCH_TILE=4 -o src/f32-clamp/gen/neon-x4.c
 tools/xngen src/f32-clamp/neon.c.in -D BATCH_TILE=8 -o src/f32-clamp/gen/neon-x8.c
diff --git a/src/f32-clamp/gen/wasmsimd-arm-x4.c b/src/f32-clamp/gen/wasmsimd-arm-x4.c
new file mode 100644
index 0000000..7a58ccd
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-arm-x4.c
@@ -0,0 +1,55 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-clamp/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_arm_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vacc = wasm_v128_load(x);
+    x += 4;
+
+    vacc = wasm_f32x4_max(vacc, vy_min);
+    vacc = wasm_f32x4_min(vacc, vy_max);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vacc = wasm_v128_load(x);
+
+    vacc = wasm_f32x4_max(vacc, vy_min);
+    vacc = wasm_f32x4_min(vacc, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/f32-clamp/gen/wasmsimd-arm-x8.c b/src/f32-clamp/gen/wasmsimd-arm-x8.c
new file mode 100644
index 0000000..86450d9
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-arm-x8.c
@@ -0,0 +1,70 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-clamp/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_arm_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    v128_t vacc0123 = wasm_v128_load(x);
+    v128_t vacc4567 = wasm_v128_load(x + 4);
+    x += 8;
+
+    vacc0123 = wasm_f32x4_max(vacc0123, vy_min);
+    vacc4567 = wasm_f32x4_max(vacc4567, vy_min);
+
+    vacc0123 = wasm_f32x4_min(vacc0123, vy_max);
+    vacc4567 = wasm_f32x4_min(vacc4567, vy_max);
+
+    wasm_v128_store(y, vacc0123);
+    wasm_v128_store(y + 4, vacc4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vacc = wasm_v128_load(x);
+    x += 4;
+
+    vacc = wasm_f32x4_max(vacc, vy_min);
+    vacc = wasm_f32x4_min(vacc, vy_max);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vacc = wasm_v128_load(x);
+
+    vacc = wasm_f32x4_max(vacc, vy_min);
+    vacc = wasm_f32x4_min(vacc, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/f32-clamp/gen/wasmsimd-x86-x4.c b/src/f32-clamp/gen/wasmsimd-x86-x4.c
new file mode 100644
index 0000000..48d0486
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-x86-x4.c
@@ -0,0 +1,59 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-clamp/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_x86_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vacc = wasm_v128_load(x);
+    x += 4;
+
+    const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+    const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+    vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+    vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vacc = wasm_v128_load(x);
+
+    const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+    const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+    vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+    vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/f32-clamp/gen/wasmsimd-x86-x8.c b/src/f32-clamp/gen/wasmsimd-x86-x8.c
new file mode 100644
index 0000000..d55dcc7
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-x86-x8.c
@@ -0,0 +1,79 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-clamp/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_x86_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    v128_t vacc0123 = wasm_v128_load(x);
+    v128_t vacc4567 = wasm_v128_load(x + 4);
+    x += 8;
+
+    const v128_t vmaskmin0123 = wasm_f32x4_lt(vacc0123, vy_min);
+    const v128_t vmaskmin4567 = wasm_f32x4_lt(vacc4567, vy_min);
+
+    const v128_t vmaskmax0123 = wasm_f32x4_le(vy_max, vacc0123);
+    vacc0123 = wasm_v128_bitselect(vy_min, vacc0123, vmaskmin0123);
+    const v128_t vmaskmax4567 = wasm_f32x4_le(vy_max, vacc4567);
+    vacc4567 = wasm_v128_bitselect(vy_min, vacc4567, vmaskmin4567);
+
+    vacc0123 = wasm_v128_bitselect(vy_max, vacc0123, vmaskmax0123);
+    vacc4567 = wasm_v128_bitselect(vy_max, vacc4567, vmaskmax4567);
+
+    wasm_v128_store(y, vacc0123);
+    wasm_v128_store(y + 4, vacc4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vacc = wasm_v128_load(x);
+    x += 4;
+
+    const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+    const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+    vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+    vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vacc = wasm_v128_load(x);
+
+    const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+    const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+    vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+    vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/f32-clamp/wasmsimd.c.in b/src/f32-clamp/wasmsimd.c.in
new file mode 100644
index 0000000..3f11387
--- /dev/null
+++ b/src/f32-clamp/wasmsimd.c.in
@@ -0,0 +1,95 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_${"x86" if X86 else "arm"}_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  $if BATCH_TILE > 4:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      v128_t vacc${ABC[0:4]} = wasm_v128_load(x);
+      $for N in range(4, BATCH_TILE, 4):
+        v128_t vacc${ABC[N:N+4]} = wasm_v128_load(x + ${N});
+      x += ${BATCH_TILE};
+
+      $if X86:
+        $for N in range(0, BATCH_TILE, 4):
+          const v128_t vmaskmin${ABC[N:N+4]} = wasm_f32x4_lt(vacc${ABC[N:N+4]}, vy_min);
+
+        $for N in range(0, BATCH_TILE, 4):
+          const v128_t vmaskmax${ABC[N:N+4]} = wasm_f32x4_le(vy_max, vacc${ABC[N:N+4]});
+          vacc${ABC[N:N+4]} = wasm_v128_bitselect(vy_min, vacc${ABC[N:N+4]}, vmaskmin${ABC[N:N+4]});
+
+        $for N in range(0, BATCH_TILE, 4):
+          vacc${ABC[N:N+4]} = wasm_v128_bitselect(vy_max, vacc${ABC[N:N+4]}, vmaskmax${ABC[N:N+4]});
+      $else:
+        $for N in range(0, BATCH_TILE, 4):
+          vacc${ABC[N:N+4]} = wasm_f32x4_max(vacc${ABC[N:N+4]}, vy_min);
+
+        $for N in range(0, BATCH_TILE, 4):
+          vacc${ABC[N:N+4]} = wasm_f32x4_min(vacc${ABC[N:N+4]}, vy_max);
+
+      wasm_v128_store(y, vacc${ABC[0:4]});
+      $for N in range(4, BATCH_TILE, 4):
+        wasm_v128_store(y + ${N}, vacc${ABC[N:N+4]});
+      y += ${BATCH_TILE};
+    }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vacc = wasm_v128_load(x);
+    x += 4;
+
+    $if X86:
+      const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+      const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+      vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+      vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+    $else:
+      vacc = wasm_f32x4_max(vacc, vy_min);
+      vacc = wasm_f32x4_min(vacc, vy_max);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vacc = wasm_v128_load(x);
+
+    $if X86:
+      const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+      const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+      vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+      vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+    $else:
+      vacc = wasm_f32x4_max(vacc, vy_min);
+      vacc = wasm_f32x4_min(vacc, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/init.c b/src/init.c
index d728c6d..65fddc0 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1814,10 +1814,11 @@
       .channel_tile = 8,
     };
     xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
-    xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__psimd_x8;
     if (is_wasm_x86) {
+      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_x86_x8;
       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x86_x16;
     } else {
+      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_arm_x8;
       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_arm_x8;
     }
     xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__psimd_x8;
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index 4954462..14c07fc 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h
@@ -46,6 +46,10 @@
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx512f_x32)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd_x4)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd_x8)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_arm_x4)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_x86_x4)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_x86_x8)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm_x1)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm_x2)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm_x4)
diff --git a/test/f32-clamp.cc b/test/f32-clamp.cc
index 9308c6f..ee703b7 100644
--- a/test/f32-clamp.cc
+++ b/test/f32-clamp.cc
@@ -747,6 +747,270 @@
 #endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_eq_4) {
+    ClampMicrokernelTester()
+      .batch_size(4)
+      .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_div_4) {
+    for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_lt_4) {
+    for (size_t batch_size = 1; batch_size < 4; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_gt_4) {
+    for (size_t batch_size = 5; batch_size < 8; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, inplace) {
+    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, qmin) {
+    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X4, qmax) {
+    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_eq_8) {
+    ClampMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_ARM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_eq_4) {
+    ClampMicrokernelTester()
+      .batch_size(4)
+      .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_div_4) {
+    for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_lt_4) {
+    for (size_t batch_size = 1; batch_size < 4; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_gt_4) {
+    for (size_t batch_size = 5; batch_size < 8; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, inplace) {
+    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, qmin) {
+    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X4, qmax) {
+    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_eq_8) {
+    ClampMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      ClampMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__WASMSIMD_X86_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .batch_size(batch_size)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+      }
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_CLAMP__WASM_X1, batch_eq_1) {
     ClampMicrokernelTester()
diff --git a/test/f32-clamp.yaml b/test/f32-clamp.yaml
index 39a1e21..c0578d7 100644
--- a/test/f32-clamp.yaml
+++ b/test/f32-clamp.yaml
@@ -12,6 +12,10 @@
 - name: xnn_f32_clamp_ukernel__avx512f_x32
 - name: xnn_f32_clamp_ukernel__psimd_x4
 - name: xnn_f32_clamp_ukernel__psimd_x8
+- name: xnn_f32_clamp_ukernel__wasmsimd_arm_x4
+- name: xnn_f32_clamp_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_clamp_ukernel__wasmsimd_x86_x4
+- name: xnn_f32_clamp_ukernel__wasmsimd_x86_x8
 - name: xnn_f32_clamp_ukernel__wasm_x1
 - name: xnn_f32_clamp_ukernel__wasm_x2
 - name: xnn_f32_clamp_ukernel__wasm_x4