WAsm SIMD versions of F32 CLAMP microkernel
PiperOrigin-RevId: 320246893
diff --git a/BUILD.bazel b/BUILD.bazel
index 89df855..80cf46c 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -505,6 +505,10 @@
]
WASMSIMD_UKERNELS = [
+ "src/f32-clamp/gen/wasmsimd-arm-x4.c",
+ "src/f32-clamp/gen/wasmsimd-arm-x8.c",
+ "src/f32-clamp/gen/wasmsimd-x86-x4.c",
+ "src/f32-clamp/gen/wasmsimd-x86-x8.c",
"src/f32-dwconv/gen/up4x25-wasmsimd.c",
"src/f32-dwconv/gen/up4x4-wasmsimd.c",
"src/f32-dwconv/gen/up4x9-wasmsimd.c",
diff --git a/scripts/generate-f32-clamp.sh b/scripts/generate-f32-clamp.sh
index 863458c..f0d2c35 100755
--- a/scripts/generate-f32-clamp.sh
+++ b/scripts/generate-f32-clamp.sh
@@ -15,6 +15,13 @@
tools/xngen src/f32-clamp/scalar.c.in -D BATCH_TILE=2 -D WASM=1 -o src/f32-clamp/gen/wasm-x2.c
tools/xngen src/f32-clamp/scalar.c.in -D BATCH_TILE=4 -D WASM=1 -o src/f32-clamp/gen/wasm-x4.c
+################################## WAsm SIMD ##################################
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=4 -D X86=0 -o src/f32-clamp/gen/wasmsimd-arm-x4.c
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=8 -D X86=0 -o src/f32-clamp/gen/wasmsimd-arm-x8.c
+
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=4 -D X86=1 -o src/f32-clamp/gen/wasmsimd-x86-x4.c
+tools/xngen src/f32-clamp/wasmsimd.c.in -D BATCH_TILE=8 -D X86=1 -o src/f32-clamp/gen/wasmsimd-x86-x8.c
+
################################### ARM NEON ##################################
tools/xngen src/f32-clamp/neon.c.in -D BATCH_TILE=4 -o src/f32-clamp/gen/neon-x4.c
tools/xngen src/f32-clamp/neon.c.in -D BATCH_TILE=8 -o src/f32-clamp/gen/neon-x8.c
diff --git a/src/f32-clamp/gen/wasmsimd-arm-x4.c b/src/f32-clamp/gen/wasmsimd-arm-x4.c
new file mode 100644
index 0000000..7a58ccd
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-arm-x4.c
@@ -0,0 +1,55 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-clamp/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_arm_x4(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vacc = wasm_v128_load(x);
+ x += 4;
+
+ vacc = wasm_f32x4_max(vacc, vy_min);
+ vacc = wasm_f32x4_min(vacc, vy_max);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vacc = wasm_v128_load(x);
+
+ vacc = wasm_f32x4_max(vacc, vy_min);
+ vacc = wasm_f32x4_min(vacc, vy_max);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/f32-clamp/gen/wasmsimd-arm-x8.c b/src/f32-clamp/gen/wasmsimd-arm-x8.c
new file mode 100644
index 0000000..86450d9
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-arm-x8.c
@@ -0,0 +1,70 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-clamp/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_arm_x8(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ v128_t vacc0123 = wasm_v128_load(x);
+ v128_t vacc4567 = wasm_v128_load(x + 4);
+ x += 8;
+
+ vacc0123 = wasm_f32x4_max(vacc0123, vy_min);
+ vacc4567 = wasm_f32x4_max(vacc4567, vy_min);
+
+ vacc0123 = wasm_f32x4_min(vacc0123, vy_max);
+ vacc4567 = wasm_f32x4_min(vacc4567, vy_max);
+
+ wasm_v128_store(y, vacc0123);
+ wasm_v128_store(y + 4, vacc4567);
+ y += 8;
+ }
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vacc = wasm_v128_load(x);
+ x += 4;
+
+ vacc = wasm_f32x4_max(vacc, vy_min);
+ vacc = wasm_f32x4_min(vacc, vy_max);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vacc = wasm_v128_load(x);
+
+ vacc = wasm_f32x4_max(vacc, vy_min);
+ vacc = wasm_f32x4_min(vacc, vy_max);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/f32-clamp/gen/wasmsimd-x86-x4.c b/src/f32-clamp/gen/wasmsimd-x86-x4.c
new file mode 100644
index 0000000..48d0486
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-x86-x4.c
@@ -0,0 +1,59 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-clamp/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_x86_x4(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vacc = wasm_v128_load(x);
+ x += 4;
+
+ const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+ const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+ vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+ vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vacc = wasm_v128_load(x);
+
+ const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+ const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+ vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+ vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/f32-clamp/gen/wasmsimd-x86-x8.c b/src/f32-clamp/gen/wasmsimd-x86-x8.c
new file mode 100644
index 0000000..d55dcc7
--- /dev/null
+++ b/src/f32-clamp/gen/wasmsimd-x86-x8.c
@@ -0,0 +1,79 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-clamp/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_x86_x8(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ v128_t vacc0123 = wasm_v128_load(x);
+ v128_t vacc4567 = wasm_v128_load(x + 4);
+ x += 8;
+
+ const v128_t vmaskmin0123 = wasm_f32x4_lt(vacc0123, vy_min);
+ const v128_t vmaskmin4567 = wasm_f32x4_lt(vacc4567, vy_min);
+
+ const v128_t vmaskmax0123 = wasm_f32x4_le(vy_max, vacc0123);
+ vacc0123 = wasm_v128_bitselect(vy_min, vacc0123, vmaskmin0123);
+ const v128_t vmaskmax4567 = wasm_f32x4_le(vy_max, vacc4567);
+ vacc4567 = wasm_v128_bitselect(vy_min, vacc4567, vmaskmin4567);
+
+ vacc0123 = wasm_v128_bitselect(vy_max, vacc0123, vmaskmax0123);
+ vacc4567 = wasm_v128_bitselect(vy_max, vacc4567, vmaskmax4567);
+
+ wasm_v128_store(y, vacc0123);
+ wasm_v128_store(y + 4, vacc4567);
+ y += 8;
+ }
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vacc = wasm_v128_load(x);
+ x += 4;
+
+ const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+ const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+ vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+ vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vacc = wasm_v128_load(x);
+
+ const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+ const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+ vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+ vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/f32-clamp/wasmsimd.c.in b/src/f32-clamp/wasmsimd.c.in
new file mode 100644
index 0000000..3f11387
--- /dev/null
+++ b/src/f32-clamp/wasmsimd.c.in
@@ -0,0 +1,95 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/common.h>
+
+
+void xnn_f32_clamp_ukernel__wasmsimd_${"x86" if X86 else "arm"}_x${BATCH_TILE}(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+ $if BATCH_TILE > 4:
+ for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+ v128_t vacc${ABC[0:4]} = wasm_v128_load(x);
+ $for N in range(4, BATCH_TILE, 4):
+ v128_t vacc${ABC[N:N+4]} = wasm_v128_load(x + ${N});
+ x += ${BATCH_TILE};
+
+ $if X86:
+ $for N in range(0, BATCH_TILE, 4):
+ const v128_t vmaskmin${ABC[N:N+4]} = wasm_f32x4_lt(vacc${ABC[N:N+4]}, vy_min);
+
+ $for N in range(0, BATCH_TILE, 4):
+ const v128_t vmaskmax${ABC[N:N+4]} = wasm_f32x4_le(vy_max, vacc${ABC[N:N+4]});
+ vacc${ABC[N:N+4]} = wasm_v128_bitselect(vy_min, vacc${ABC[N:N+4]}, vmaskmin${ABC[N:N+4]});
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = wasm_v128_bitselect(vy_max, vacc${ABC[N:N+4]}, vmaskmax${ABC[N:N+4]});
+ $else:
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = wasm_f32x4_max(vacc${ABC[N:N+4]}, vy_min);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = wasm_f32x4_min(vacc${ABC[N:N+4]}, vy_max);
+
+ wasm_v128_store(y, vacc${ABC[0:4]});
+ $for N in range(4, BATCH_TILE, 4):
+ wasm_v128_store(y + ${N}, vacc${ABC[N:N+4]});
+ y += ${BATCH_TILE};
+ }
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vacc = wasm_v128_load(x);
+ x += 4;
+
+ $if X86:
+ const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+ const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+ vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+ vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+ $else:
+ vacc = wasm_f32x4_max(vacc, vy_min);
+ vacc = wasm_f32x4_min(vacc, vy_max);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vacc = wasm_v128_load(x);
+
+ $if X86:
+ const v128_t vmaskmin = wasm_f32x4_lt(vacc, vy_min);
+ const v128_t vmaskmax = wasm_f32x4_le(vy_max, vacc);
+ vacc = wasm_v128_bitselect(vy_min, vacc, vmaskmin);
+ vacc = wasm_v128_bitselect(vy_max, vacc, vmaskmax);
+ $else:
+ vacc = wasm_f32x4_max(vacc, vy_min);
+ vacc = wasm_f32x4_min(vacc, vy_max);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/init.c b/src/init.c
index d728c6d..65fddc0 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1814,10 +1814,11 @@
.channel_tile = 8,
};
xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
- xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__psimd_x8;
if (is_wasm_x86) {
+ xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_x86_x8;
xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x86_x16;
} else {
+ xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_arm_x8;
xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_arm_x8;
}
xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__psimd_x8;
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index 4954462..14c07fc 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h
@@ -46,6 +46,10 @@
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx512f_x32)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd_x4)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd_x8)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_arm_x4)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_x86_x4)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasmsimd_x86_x8)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm_x1)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm_x2)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm_x4)
diff --git a/test/f32-clamp.cc b/test/f32-clamp.cc
index 9308c6f..ee703b7 100644
--- a/test/f32-clamp.cc
+++ b/test/f32-clamp.cc
@@ -747,6 +747,270 @@
#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_eq_4) {
+ ClampMicrokernelTester()
+ .batch_size(4)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_div_4) {
+ for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_lt_4) {
+ for (size_t batch_size = 1; batch_size < 4; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, batch_gt_4) {
+ for (size_t batch_size = 5; batch_size < 8; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, inplace) {
+ for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, qmin) {
+ for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X4, qmax) {
+ for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x4);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_eq_8) {
+ ClampMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, inplace) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, qmin) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_ARM_X8, qmax) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_arm_x8);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_eq_4) {
+ ClampMicrokernelTester()
+ .batch_size(4)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_div_4) {
+ for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_lt_4) {
+ for (size_t batch_size = 1; batch_size < 4; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, batch_gt_4) {
+ for (size_t batch_size = 5; batch_size < 8; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, inplace) {
+ for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, qmin) {
+ for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X4, qmax) {
+ for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x4);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_eq_8) {
+ ClampMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, inplace) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, qmin) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__WASMSIMD_X86_X8, qmax) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__wasmsimd_x86_x8);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD
+
+
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
TEST(F32_CLAMP__WASM_X1, batch_eq_1) {
ClampMicrokernelTester()
diff --git a/test/f32-clamp.yaml b/test/f32-clamp.yaml
index 39a1e21..c0578d7 100644
--- a/test/f32-clamp.yaml
+++ b/test/f32-clamp.yaml
@@ -12,6 +12,10 @@
- name: xnn_f32_clamp_ukernel__avx512f_x32
- name: xnn_f32_clamp_ukernel__psimd_x4
- name: xnn_f32_clamp_ukernel__psimd_x8
+- name: xnn_f32_clamp_ukernel__wasmsimd_arm_x4
+- name: xnn_f32_clamp_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_clamp_ukernel__wasmsimd_x86_x4
+- name: xnn_f32_clamp_ukernel__wasmsimd_x86_x8
- name: xnn_f32_clamp_ukernel__wasm_x1
- name: xnn_f32_clamp_ukernel__wasm_x2
- name: xnn_f32_clamp_ukernel__wasm_x4