F32 CLAMP micro-kernels in AVX and AVX512F implementations

PiperOrigin-RevId: 282845725
diff --git a/BUILD.bazel b/BUILD.bazel
index 4bcf91a..4e40b4f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -601,6 +601,7 @@
 ]
 
 AVX_UKERNELS = [
+    "src/f32-clamp/avx.c",
     "src/f32-dwconv/gen/up16x4-avx-acc2.c",
     "src/f32-dwconv/gen/up16x4-avx.c",
     "src/f32-dwconv/gen/up8x4-avx-acc2.c",
@@ -679,6 +680,7 @@
 ]
 
 AVX512F_UKERNELS = [
+    "src/f32-clamp/avx512f.c",
     "src/f32-dwconv/gen/up32x4-avx512f-acc2.c",
     "src/f32-dwconv/gen/up32x4-avx512f.c",
     "src/f32-dwconv/gen/up16x4-avx512f-acc2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4089f6..b3a81a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -681,6 +681,7 @@
   src/f32-sigmoid/gen/sse41-p5-div-x16.c)
 
 SET(XNNPACK_AVX_MICROKERNEL_SRCS
+  src/f32-clamp/avx.c
   src/f32-dwconv/gen/up16x4-avx-acc2.c
   src/f32-dwconv/gen/up16x4-avx.c
   src/f32-dwconv/gen/up8x4-avx-acc2.c
@@ -756,6 +757,7 @@
   src/math/extexp-avx2-p5.c)
 
 SET(XNNPACK_AVX512F_MICROKERNEL_SRCS
+  src/f32-clamp/avx512f.c
   src/f32-dwconv/gen/up32x4-avx512f-acc2.c
   src/f32-dwconv/gen/up32x4-avx512f.c
   src/f32-dwconv/gen/up16x4-avx512f-acc2.c
diff --git a/src/f32-clamp/avx.c b/src/f32-clamp/avx.c
new file mode 100644
index 0000000..9f3414c
--- /dev/null
+++ b/src/f32-clamp/avx.c
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/clamp.h>
+
+
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+void xnn_f32_clamp_ukernel__avx(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m256 voutput_max = _mm256_broadcast_ps((const __m128*) params->sse.max);
+  const __m256 voutput_min = _mm256_broadcast_ps((const __m128*) params->sse.min);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const __m256 vx = _mm256_loadu_ps(x);
+    x += 8;
+
+    const __m256 vy = _mm256_min_ps(_mm256_max_ps(vx, voutput_min), voutput_max);
+
+    _mm256_storeu_ps(y, vy);
+    y += 8;
+  }
+  if (n != 0) {
+    assert(n >= 1);
+    assert(n <= 7);
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+    const __m256 vx = _mm256_maskload_ps(x, vmask);
+
+    const __m256 vy = _mm256_min_ps(_mm256_max_ps(vx, voutput_min), voutput_max);
+
+    _mm256_maskstore_ps(y, vmask, vy);
+  }
+}
diff --git a/src/f32-clamp/avx512f.c b/src/f32-clamp/avx512f.c
new file mode 100644
index 0000000..d1df6e4
--- /dev/null
+++ b/src/f32-clamp/avx512f.c
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/clamp.h>
+
+
+void xnn_f32_clamp_ukernel__avx512f(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m512 voutput_max = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
+  const __m512 voutput_min = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const __m512 vx = _mm512_loadu_ps(x);
+    x += 16;
+
+    const __m512 vy = _mm512_min_ps(_mm512_max_ps(vx, voutput_min), voutput_max);
+
+    _mm512_storeu_ps(y, vy);
+    y += 16;
+  }
+  if (n != 0) {
+    assert(n >= 1);
+    assert(n <= 15);
+    // Prepare mask for valid 32-bit elements (depends on n).
+    n >>= 2 /* log2(sizeof(float)) */;
+    const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
+
+    const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
+
+    const __m512 vy = _mm512_min_ps(_mm512_max_ps(vx, voutput_min), voutput_max);
+
+    _mm512_mask_storeu_ps(y, vmask, vy);
+  }
+}
diff --git a/src/init.c b/src/init.c
index 4b9e0ce..e7ce5b3 100644
--- a/src/init.c
+++ b/src/init.c
@@ -771,7 +771,13 @@
       .pixel_tile = 1,
       .channel_tile = 8,
     };
-    xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse;
+    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
+      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx512f;
+    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
+      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx;
+    } else {
+      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse;
+    }
     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse;
     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse2_p5_div_x16;
     xnn_params.f32.prelu = (struct prelu_parameters) {
diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index 0cd59b4..7a9c67b 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h
@@ -26,9 +26,11 @@
       float* y,                                       \
       const union xnn_f32_output_params* params);
 
-DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__neon)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__sse)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx512f)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__scalar)
 
 
diff --git a/test/f32-clamp.cc b/test/f32-clamp.cc
index 6abcc53..9ee5c30 100644
--- a/test/f32-clamp.cc
+++ b/test/f32-clamp.cc
@@ -12,148 +12,6 @@
 #include "clamp-microkernel-tester.h"
 
 
-#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-  TEST(F32_CLAMP__PSIMD, n_eq_4) {
-    TEST_REQUIRES_PSIMD;
-    ClampMicrokernelTester()
-      .n(4)
-      .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-  }
-
-  TEST(F32_CLAMP__PSIMD, n_div_4) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 4; n < 256; n += 4) {
-      ClampMicrokernelTester()
-        .n(n)
-        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_CLAMP__PSIMD, n_gt_4) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 5; n < 8; n++) {
-      ClampMicrokernelTester()
-        .n(n)
-        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_CLAMP__PSIMD, n_lt_4) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n < 4; n++) {
-      ClampMicrokernelTester()
-        .n(n)
-        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_CLAMP__PSIMD, inplace) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n < 64; n += 3) {
-      ClampMicrokernelTester()
-        .iterations(1)
-        .n(n)
-        .inplace(true)
-        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_CLAMP__PSIMD, qmin) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n < 64; n += 5) {
-      for (uint8_t qmin = 1; qmin < 255; qmin++) {
-        ClampMicrokernelTester()
-          .iterations(1)
-          .n(n)
-          .qmin(qmin)
-          .qmax(255)
-          .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_CLAMP__PSIMD, qmax) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n < 64; n += 5) {
-      for (uint8_t qmax = 1; qmax < 255; qmax++) {
-        ClampMicrokernelTester()
-          .iterations(1)
-          .n(n)
-          .qmin(0)
-          .qmax(qmax)
-          .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-
-
-TEST(F32_CLAMP__SCALAR, n_eq_2) {
-  ClampMicrokernelTester()
-    .n(2)
-    .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-}
-
-TEST(F32_CLAMP__SCALAR, n_div_2) {
-  for (size_t n = 4; n < 128; n += 2) {
-    ClampMicrokernelTester()
-      .n(n)
-      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_CLAMP__SCALAR, n_gt_2) {
-  for (size_t n = 3; n < 4; n++) {
-    ClampMicrokernelTester()
-      .n(n)
-      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_CLAMP__SCALAR, n_lt_2) {
-  for (size_t n = 1; n < 2; n++) {
-    ClampMicrokernelTester()
-      .n(n)
-      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_CLAMP__SCALAR, inplace) {
-  for (size_t n = 1; n < 32; n += 3) {
-    ClampMicrokernelTester()
-      .iterations(1)
-      .n(n)
-      .inplace(true)
-      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_CLAMP__SCALAR, qmin) {
-  for (size_t n = 1; n < 32; n += 3) {
-    for (uint8_t qmin = 1; qmin < 255; qmin++) {
-      ClampMicrokernelTester()
-        .iterations(1)
-        .n(n)
-        .qmin(qmin)
-        .qmax(255)
-        .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_CLAMP__SCALAR, qmax) {
-  for (size_t n = 1; n < 32; n += 3) {
-    for (uint8_t qmax = 1; qmax < 255; qmax++) {
-      ClampMicrokernelTester()
-        .iterations(1)
-        .n(n)
-        .qmin(0)
-        .qmax(qmax)
-        .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_CLAMP__NEON, n_eq_4) {
     TEST_REQUIRES_ARM_NEON;
@@ -303,3 +161,295 @@
     }
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_CLAMP__AVX, n_eq_8) {
+    TEST_REQUIRES_X86_AVX;
+    ClampMicrokernelTester()
+      .n(8)
+      .Test(xnn_f32_clamp_ukernel__avx);
+  }
+
+  TEST(F32_CLAMP__AVX, n_div_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t n = 8; n < 512; n += 8) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__avx);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX, n_gt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t n = 9; n < 16; n++) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__avx);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX, n_lt_8) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t n = 1; n < 8; n++) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__avx);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX, inplace) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t n = 1; n < 128; n += 7) {
+      ClampMicrokernelTester()
+        .iterations(1)
+        .n(n)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__avx);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX, qmin) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t n = 1; n < 128; n += 7) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .iterations(1)
+          .n(n)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__avx);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__AVX, qmax) {
+    TEST_REQUIRES_X86_AVX;
+    for (size_t n = 1; n < 128; n += 7) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .iterations(1)
+          .n(n)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__avx);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_CLAMP__AVX512F, n_eq_16) {
+    TEST_REQUIRES_X86_AVX512F;
+    ClampMicrokernelTester()
+      .n(16)
+      .Test(xnn_f32_clamp_ukernel__avx512f);
+  }
+
+  TEST(F32_CLAMP__AVX512F, n_div_16) {
+    TEST_REQUIRES_X86_AVX512F;
+    for (size_t n = 16; n < 1024; n += 16) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__avx512f);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX512F, n_gt_16) {
+    TEST_REQUIRES_X86_AVX512F;
+    for (size_t n = 17; n < 32; n++) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__avx512f);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX512F, n_lt_16) {
+    TEST_REQUIRES_X86_AVX512F;
+    for (size_t n = 1; n < 16; n++) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__avx512f);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX512F, inplace) {
+    TEST_REQUIRES_X86_AVX512F;
+    for (size_t n = 1; n < 256; n += 15) {
+      ClampMicrokernelTester()
+        .iterations(1)
+        .n(n)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__avx512f);
+    }
+  }
+
+  TEST(F32_CLAMP__AVX512F, qmin) {
+    TEST_REQUIRES_X86_AVX512F;
+    for (size_t n = 1; n < 256; n += 15) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .iterations(1)
+          .n(n)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__avx512f);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__AVX512F, qmax) {
+    TEST_REQUIRES_X86_AVX512F;
+    for (size_t n = 1; n < 256; n += 15) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .iterations(1)
+          .n(n)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__avx512f);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
+  TEST(F32_CLAMP__PSIMD, n_eq_4) {
+    TEST_REQUIRES_PSIMD;
+    ClampMicrokernelTester()
+      .n(4)
+      .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_CLAMP__PSIMD, n_div_4) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t n = 4; n < 256; n += 4) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_CLAMP__PSIMD, n_gt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t n = 5; n < 8; n++) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_CLAMP__PSIMD, n_lt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t n = 1; n < 4; n++) {
+      ClampMicrokernelTester()
+        .n(n)
+        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_CLAMP__PSIMD, inplace) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t n = 1; n < 64; n += 3) {
+      ClampMicrokernelTester()
+        .iterations(1)
+        .n(n)
+        .inplace(true)
+        .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_CLAMP__PSIMD, qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t n = 1; n < 64; n += 5) {
+      for (uint8_t qmin = 1; qmin < 255; qmin++) {
+        ClampMicrokernelTester()
+          .iterations(1)
+          .n(n)
+          .qmin(qmin)
+          .qmax(255)
+          .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_CLAMP__PSIMD, qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t n = 1; n < 64; n += 5) {
+      for (uint8_t qmax = 1; qmax < 255; qmax++) {
+        ClampMicrokernelTester()
+          .iterations(1)
+          .n(n)
+          .qmin(0)
+          .qmax(qmax)
+          .Test(xnn_f32_clamp_ukernel__psimd, ClampMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
+
+
+TEST(F32_CLAMP__SCALAR, n_eq_2) {
+  ClampMicrokernelTester()
+    .n(2)
+    .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_CLAMP__SCALAR, n_div_2) {
+  for (size_t n = 4; n < 128; n += 2) {
+    ClampMicrokernelTester()
+      .n(n)
+      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_CLAMP__SCALAR, n_gt_2) {
+  for (size_t n = 3; n < 4; n++) {
+    ClampMicrokernelTester()
+      .n(n)
+      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_CLAMP__SCALAR, n_lt_2) {
+  for (size_t n = 1; n < 2; n++) {
+    ClampMicrokernelTester()
+      .n(n)
+      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_CLAMP__SCALAR, inplace) {
+  for (size_t n = 1; n < 32; n += 3) {
+    ClampMicrokernelTester()
+      .iterations(1)
+      .n(n)
+      .inplace(true)
+      .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_CLAMP__SCALAR, qmin) {
+  for (size_t n = 1; n < 32; n += 3) {
+    for (uint8_t qmin = 1; qmin < 255; qmin++) {
+      ClampMicrokernelTester()
+        .iterations(1)
+        .n(n)
+        .qmin(qmin)
+        .qmax(255)
+        .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_CLAMP__SCALAR, qmax) {
+  for (size_t n = 1; n < 32; n += 3) {
+    for (uint8_t qmax = 1; qmax < 255; qmax++) {
+      ClampMicrokernelTester()
+        .iterations(1)
+        .n(n)
+        .qmin(0)
+        .qmax(qmax)
+        .Test(xnn_f32_clamp_ukernel__scalar, ClampMicrokernelTester::Variant::Scalar);
+    }
+  }
+}