F32 CLAMP micro-kernels in AVX and AVX512F implementations
PiperOrigin-RevId: 282845725
diff --git a/BUILD.bazel b/BUILD.bazel
index 4bcf91a..4e40b4f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -601,6 +601,7 @@
]
AVX_UKERNELS = [
+ "src/f32-clamp/avx.c",
"src/f32-dwconv/gen/up16x4-avx-acc2.c",
"src/f32-dwconv/gen/up16x4-avx.c",
"src/f32-dwconv/gen/up8x4-avx-acc2.c",
@@ -679,6 +680,7 @@
]
AVX512F_UKERNELS = [
+ "src/f32-clamp/avx512f.c",
"src/f32-dwconv/gen/up32x4-avx512f-acc2.c",
"src/f32-dwconv/gen/up32x4-avx512f.c",
"src/f32-dwconv/gen/up16x4-avx512f-acc2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4089f6..b3a81a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -681,6 +681,7 @@
src/f32-sigmoid/gen/sse41-p5-div-x16.c)
SET(XNNPACK_AVX_MICROKERNEL_SRCS
+ src/f32-clamp/avx.c
src/f32-dwconv/gen/up16x4-avx-acc2.c
src/f32-dwconv/gen/up16x4-avx.c
src/f32-dwconv/gen/up8x4-avx-acc2.c
@@ -756,6 +757,7 @@
src/math/extexp-avx2-p5.c)
SET(XNNPACK_AVX512F_MICROKERNEL_SRCS
+ src/f32-clamp/avx512f.c
src/f32-dwconv/gen/up32x4-avx512f-acc2.c
src/f32-dwconv/gen/up32x4-avx512f.c
src/f32-dwconv/gen/up16x4-avx512f-acc2.c
diff --git a/src/f32-clamp/avx.c b/src/f32-clamp/avx.c
new file mode 100644
index 0000000..9f3414c
--- /dev/null
+++ b/src/f32-clamp/avx.c
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/clamp.h>
+
+
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+void xnn_f32_clamp_ukernel__avx(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+ const __m256 voutput_max = _mm256_broadcast_ps((const __m128*) params->sse.max);
+ const __m256 voutput_min = _mm256_broadcast_ps((const __m128*) params->sse.min);
+
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ const __m256 vx = _mm256_loadu_ps(x);
+ x += 8;
+
+ const __m256 vy = _mm256_min_ps(_mm256_max_ps(vx, voutput_min), voutput_max);
+
+ _mm256_storeu_ps(y, vy);
+ y += 8;
+ }
+ if (n != 0) {
+ assert(n >= 1);
+ assert(n <= 7);
+ __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+ const __m256 vx = _mm256_maskload_ps(x, vmask);
+
+ const __m256 vy = _mm256_min_ps(_mm256_max_ps(vx, voutput_min), voutput_max);
+
+ _mm256_maskstore_ps(y, vmask, vy);
+ }
+}
diff --git a/src/f32-clamp/avx512f.c b/src/f32-clamp/avx512f.c
new file mode 100644
index 0000000..d1df6e4
--- /dev/null
+++ b/src/f32-clamp/avx512f.c
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/clamp.h>
+
+
+void xnn_f32_clamp_ukernel__avx512f(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_output_params params[restrict static 1])
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+ const __m512 voutput_max = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+ const __m512 voutput_min = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+
+ for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+ const __m512 vx = _mm512_loadu_ps(x);
+ x += 16;
+
+ const __m512 vy = _mm512_min_ps(_mm512_max_ps(vx, voutput_min), voutput_max);
+
+ _mm512_storeu_ps(y, vy);
+ y += 16;
+ }
+ if (n != 0) {
+ assert(n >= 1);
+ assert(n <= 15);
+ // Prepare mask for valid 32-bit elements (depends on n).
+ n >>= 2 /* log2(sizeof(float)) */;
+ const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
+
+ const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
+
+ const __m512 vy = _mm512_min_ps(_mm512_max_ps(vx, voutput_min), voutput_max);
+
+ _mm512_mask_storeu_ps(y, vmask, vy);
+ }
+}
diff --git a/src/init.c b/src/init.c
index 4b9e0ce..e7ce5b3 100644
--- a/src/init.c
+++ b/src/init.c
@@ -771,7 +771,13 @@
.pixel_tile = 1,
.channel_tile = 8,
};
- xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse;
+ if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
+ xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx512f;
+ } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
+ xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx;
+ } else {
+ xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse;
+ }
xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse;
xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse2_p5_div_x16;
xnn_params.f32.prelu = (struct prelu_parameters) {
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index 0cd59b4..7a9c67b 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h
@@ -26,9 +26,11 @@
float* y, \
const union xnn_f32_output_params* params);
-DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__neon)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__sse)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx512f)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__scalar)
diff --git a/test/f32-clamp.cc b/test/f32-clamp.cc
index 6abcc53..9ee5c30 100644
--- a/test/f32-clamp.cc
+++ b/test/f32-clamp.cc
@@ -12,148 +12,6 @@
#include "clamp-microkernel-tester.h"
-#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
- TEST(F32_CLAMP__PSIMD, n_eq_4) {
- TEST_REQUIRES_PSIMD;
- ClampMicrokernelTester()
- .n(4)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
-
- TEST(F32_CLAMP__PSIMD, n_div_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 4; n < 256; n += 4) {
- ClampMicrokernelTester()
- .n(n)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_CLAMP__PSIMD, n_gt_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 5; n < 8; n++) {
- ClampMicrokernelTester()
- .n(n)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_CLAMP__PSIMD, n_lt_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n < 4; n++) {
- ClampMicrokernelTester()
- .n(n)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_CLAMP__PSIMD, inplace) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n < 64; n += 3) {
- ClampMicrokernelTester()
- .iterations(1)
- .n(n)
- .inplace(true)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_CLAMP__PSIMD, qmin) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n < 64; n += 5) {
- for (uint8_t qmin = 1; qmin < 255; qmin++) {
- ClampMicrokernelTester()
- .iterations(1)
- .n(n)
- .qmin(qmin)
- .qmax(255)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_CLAMP__PSIMD, qmax) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n < 64; n += 5) {
- for (uint8_t qmax = 1; qmax < 255; qmax++) {
- ClampMicrokernelTester()
- .iterations(1)
- .n(n)
- .qmin(0)
- .qmax(qmax)
- .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
- }
- }
- }
-#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-
-
-TEST(F32_CLAMP__SCALAR, n_eq_2) {
- ClampMicrokernelTester()
- .n(2)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-}
-
-TEST(F32_CLAMP__SCALAR, n_div_2) {
- for (size_t n = 4; n < 128; n += 2) {
- ClampMicrokernelTester()
- .n(n)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(F32_CLAMP__SCALAR, n_gt_2) {
- for (size_t n = 3; n < 4; n++) {
- ClampMicrokernelTester()
- .n(n)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(F32_CLAMP__SCALAR, n_lt_2) {
- for (size_t n = 1; n < 2; n++) {
- ClampMicrokernelTester()
- .n(n)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(F32_CLAMP__SCALAR, inplace) {
- for (size_t n = 1; n < 32; n += 3) {
- ClampMicrokernelTester()
- .iterations(1)
- .n(n)
- .inplace(true)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(F32_CLAMP__SCALAR, qmin) {
- for (size_t n = 1; n < 32; n += 3) {
- for (uint8_t qmin = 1; qmin < 255; qmin++) {
- ClampMicrokernelTester()
- .iterations(1)
- .n(n)
- .qmin(qmin)
- .qmax(255)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_CLAMP__SCALAR, qmax) {
- for (size_t n = 1; n < 32; n += 3) {
- for (uint8_t qmax = 1; qmax < 255; qmax++) {
- ClampMicrokernelTester()
- .iterations(1)
- .n(n)
- .qmin(0)
- .qmax(qmax)
- .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(F32_CLAMP__NEON, n_eq_4) {
TEST_REQUIRES_ARM_NEON;
@@ -303,3 +161,295 @@
}
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_CLAMP__AVX, n_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ ClampMicrokernelTester()
+ .n(8)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+
+ TEST(F32_CLAMP__AVX, n_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t n = 8; n < 512; n += 8) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX, n_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t n = 9; n < 16; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX, n_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t n = 1; n < 8; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX, inplace) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t n = 1; n < 128; n += 7) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX, qmin) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t n = 1; n < 128; n += 7) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__AVX, qmax) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t n = 1; n < 128; n += 7) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__avx);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_CLAMP__AVX512F, n_eq_16) {
+ TEST_REQUIRES_X86_AVX512F;
+ ClampMicrokernelTester()
+ .n(16)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+
+ TEST(F32_CLAMP__AVX512F, n_div_16) {
+ TEST_REQUIRES_X86_AVX512F;
+ for (size_t n = 16; n < 1024; n += 16) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX512F, n_gt_16) {
+ TEST_REQUIRES_X86_AVX512F;
+ for (size_t n = 17; n < 32; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX512F, n_lt_16) {
+ TEST_REQUIRES_X86_AVX512F;
+ for (size_t n = 1; n < 16; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX512F, inplace) {
+ TEST_REQUIRES_X86_AVX512F;
+ for (size_t n = 1; n < 256; n += 15) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+ }
+
+ TEST(F32_CLAMP__AVX512F, qmin) {
+ TEST_REQUIRES_X86_AVX512F;
+ for (size_t n = 1; n < 256; n += 15) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__AVX512F, qmax) {
+ TEST_REQUIRES_X86_AVX512F;
+ for (size_t n = 1; n < 256; n += 15) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__avx512f);
+ }
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
+ TEST(F32_CLAMP__PSIMD, n_eq_4) {
+ TEST_REQUIRES_PSIMD;
+ ClampMicrokernelTester()
+ .n(4)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_CLAMP__PSIMD, n_div_4) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t n = 4; n < 256; n += 4) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_CLAMP__PSIMD, n_gt_4) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t n = 5; n < 8; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_CLAMP__PSIMD, n_lt_4) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t n = 1; n < 4; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_CLAMP__PSIMD, inplace) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t n = 1; n < 64; n += 3) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_CLAMP__PSIMD, qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t n = 1; n < 64; n += 5) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_CLAMP__PSIMD, qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t n = 1; n < 64; n += 5) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
+
+
+TEST(F32_CLAMP__SCALAR, n_eq_2) {
+ ClampMicrokernelTester()
+ .n(2)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_CLAMP__SCALAR, n_div_2) {
+ for (size_t n = 4; n < 128; n += 2) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_CLAMP__SCALAR, n_gt_2) {
+ for (size_t n = 3; n < 4; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_CLAMP__SCALAR, n_lt_2) {
+ for (size_t n = 1; n < 2; n++) {
+ ClampMicrokernelTester()
+ .n(n)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_CLAMP__SCALAR, inplace) {
+ for (size_t n = 1; n < 32; n += 3) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .inplace(true)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_CLAMP__SCALAR, qmin) {
+ for (size_t n = 1; n < 32; n += 3) {
+ for (uint8_t qmin = 1; qmin < 255; qmin++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(qmin)
+ .qmax(255)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_CLAMP__SCALAR, qmax) {
+ for (size_t n = 1; n < 32; n += 3) {
+ for (uint8_t qmax = 1; qmax < 255; qmax++) {
+ ClampMicrokernelTester()
+ .iterations(1)
+ .n(n)
+ .qmin(0)
+ .qmax(qmax)
+ .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+ }
+ }
+}