Reoptimize HSWISH microkernels

- Compute Hard Swish as (x * (1/6)) * min(max(x + 3, 0), 6) instead of x *
  min(max(x * (1/6) + 0.5, 0), 1) to expose more ILP
- Use integer minimum/maximum instructions in NEON and WAsm SIMD
- Remove NEON FMA microkernels
- Remove PSIMD microkernels
- Combine x86 and ARM versions of WAsm SIMD microkernels

Performance:
- Cortex-A73: 10.1 GB/s -> 11.8 GB/s
- Cortex-A73 (WAsm SIMD): 8.4 GB/s -> 8.9 GB/s
- Skylake-X (WAsm SIMD): 24 GB/s -> 51 GB/s
PiperOrigin-RevId: 320719817
diff --git a/BUILD.bazel b/BUILD.bazel
index 8bae09e..70ecd13 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -657,12 +657,9 @@
     "src/f32-igemm/gen/6x8s4-minmax-wasmsimd-x86.c",
     "src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c",
     "src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c",
-    "src/f32-hswish/gen/wasmsimd-arm-x4.c",
-    "src/f32-hswish/gen/wasmsimd-arm-x8.c",
-    "src/f32-hswish/gen/wasmsimd-arm-x16.c",
-    "src/f32-hswish/gen/wasmsimd-x86-x4.c",
-    "src/f32-hswish/gen/wasmsimd-x86-x8.c",
-    "src/f32-hswish/gen/wasmsimd-x86-x16.c",
+    "src/f32-hswish/gen/wasmsimd-x4.c",
+    "src/f32-hswish/gen/wasmsimd-x8.c",
+    "src/f32-hswish/gen/wasmsimd-x16.c",
     "src/f32-maxpool/9p8x-minmax-wasmsimd-arm-c4.c",
     "src/f32-maxpool/9p8x-minmax-wasmsimd-x86-c4.c",
     "src/f32-prelu/gen/wasmsimd-bitselect-2x4.c",
@@ -821,8 +818,6 @@
     "src/f32-gemm/gen-inc/6x8inc-minmax-psimd-loadsplat.c",
     "src/f32-gemm/gen-inc/6x8inc-minmax-psimd-splat.c",
     "src/f32-gemm/gen-inc/6x8s4inc-minmax-psimd.c",
-    "src/f32-hswish/gen/psimd-x4.c",
-    "src/f32-hswish/gen/psimd-x8.c",
     "src/f32-ibilinear/gen/psimd-c4.c",
     "src/f32-ibilinear/gen/psimd-c8.c",
     "src/f32-igemm/gen/1x8-minmax-psimd-loadsplat.c",
@@ -1198,8 +1193,6 @@
     "src/f32-gemm/gen-inc/4x8s4inc-minmax-neonfma.c",
     "src/f32-gemm/gen-inc/6x8s4inc-minmax-neonfma.c",
     "src/f32-gemm/gen-inc/8x8s4inc-minmax-neonfma.c",
-    "src/f32-hswish/gen/neonfma-x4.c",
-    "src/f32-hswish/gen/neonfma-x8.c",
     "src/f32-ppmm/gen/4x8-minmax-neonfma.c",
     "src/f32-ppmm/gen/8x8-minmax-neonfma.c",
     "src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea7463f..12dcf21 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -558,8 +558,6 @@
   src/f32-gemm/gen-inc/6x8inc-minmax-psimd-loadsplat.c
   src/f32-gemm/gen-inc/6x8inc-minmax-psimd-splat.c
   src/f32-gemm/gen-inc/6x8s4inc-minmax-psimd.c
-  src/f32-hswish/gen/psimd-x4.c
-  src/f32-hswish/gen/psimd-x8.c
   src/f32-ibilinear/gen/psimd-c4.c
   src/f32-ibilinear/gen/psimd-c8.c
   src/f32-igemm/gen/1x8-minmax-psimd-loadsplat.c
@@ -931,8 +929,6 @@
   src/f32-gemm/gen-inc/4x8s4inc-minmax-neonfma.c
   src/f32-gemm/gen-inc/6x8s4inc-minmax-neonfma.c
   src/f32-gemm/gen-inc/8x8s4inc-minmax-neonfma.c
-  src/f32-hswish/gen/neonfma-x4.c
-  src/f32-hswish/gen/neonfma-x8.c
   src/f32-ppmm/gen/4x8-minmax-neonfma.c
   src/f32-ppmm/gen/8x8-minmax-neonfma.c
   src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c
diff --git a/bench/f32-hswish.cc b/bench/f32-hswish.cc
index feef5da..636b849 100644
--- a/bench/f32-hswish.cc
+++ b/bench/f32-hswish.cc
@@ -66,15 +66,6 @@
     ->RangeMultiplier(10)
     ->Range(1000, 1000000)
     ->UseRealTime();
-
-  BENCHMARK_CAPTURE(f32_hswish, neonfma_x4, xnn_f32_hswish_ukernel__neonfma_x4, benchmark::utils::CheckNEONFMA)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
-  BENCHMARK_CAPTURE(f32_hswish, neonfma_x8, xnn_f32_hswish_ukernel__neonfma_x8, benchmark::utils::CheckNEONFMA)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -115,40 +106,16 @@
     ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-  BENCHMARK_CAPTURE(f32_hswish, psimd_x4, xnn_f32_hswish_ukernel__psimd_x4)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
-  BENCHMARK_CAPTURE(f32_hswish, psimd_x8, xnn_f32_hswish_ukernel__psimd_x8)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
-#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
 #if XNN_ARCH_WASMSIMD
-  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_arm_x4, xnn_f32_hswish_ukernel__wasmsimd_arm_x4)
+  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x4, xnn_f32_hswish_ukernel__wasmsimd_x4)
     ->RangeMultiplier(10)
     ->Range(1000, 1000000)
     ->UseRealTime();
-  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_arm_x8, xnn_f32_hswish_ukernel__wasmsimd_arm_x8)
+  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x8, xnn_f32_hswish_ukernel__wasmsimd_x8)
     ->RangeMultiplier(10)
     ->Range(1000, 1000000)
     ->UseRealTime();
-  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_arm_x16, xnn_f32_hswish_ukernel__wasmsimd_arm_x16)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
-
-  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x86_x4, xnn_f32_hswish_ukernel__wasmsimd_x86_x4)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
-  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x86_x8, xnn_f32_hswish_ukernel__wasmsimd_x86_x8)
-    ->RangeMultiplier(10)
-    ->Range(1000, 1000000)
-    ->UseRealTime();
-  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x86_x16, xnn_f32_hswish_ukernel__wasmsimd_x86_x16)
+  BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x16, xnn_f32_hswish_ukernel__wasmsimd_x16)
     ->RangeMultiplier(10)
     ->Range(1000, 1000000)
     ->UseRealTime();
diff --git a/scripts/generate-f32-hswish.sh b/scripts/generate-f32-hswish.sh
index 3155c0d..4c9b51b 100755
--- a/scripts/generate-f32-hswish.sh
+++ b/scripts/generate-f32-hswish.sh
@@ -16,24 +16,13 @@
 tools/xngen src/f32-hswish/scalar.c.in -D BATCH_TILE=4 -D WASM=1 -o src/f32-hswish/gen/wasm-x4.c
 
 ################################## WAsm SIMD ##################################
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=4  -D X86=0 -o src/f32-hswish/gen/wasmsimd-arm-x4.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=8  -D X86=0 -o src/f32-hswish/gen/wasmsimd-arm-x8.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=16 -D X86=0 -o src/f32-hswish/gen/wasmsimd-arm-x16.c
-
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=4  -D X86=1 -o src/f32-hswish/gen/wasmsimd-x86-x4.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=8  -D X86=1 -o src/f32-hswish/gen/wasmsimd-x86-x8.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=16 -D X86=1 -o src/f32-hswish/gen/wasmsimd-x86-x16.c
+tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=4  -o src/f32-hswish/gen/wasmsimd-x4.c
+tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=8  -o src/f32-hswish/gen/wasmsimd-x8.c
+tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=16 -o src/f32-hswish/gen/wasmsimd-x16.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=4 -D FMA=0 -o src/f32-hswish/gen/neon-x4.c
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=8 -D FMA=0 -o src/f32-hswish/gen/neon-x8.c
-
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=4 -D FMA=1 -o src/f32-hswish/gen/neonfma-x4.c
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=8 -D FMA=1 -o src/f32-hswish/gen/neonfma-x8.c
-
-#################################### PSIMD ####################################
-tools/xngen src/f32-hswish/psimd.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/psimd-x4.c
-tools/xngen src/f32-hswish/psimd.c.in -D BATCH_TILE=8 -o src/f32-hswish/gen/psimd-x8.c
+tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/neon-x4.c
+tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=8 -o src/f32-hswish/gen/neon-x8.c
 
 ################################# x86 128-bit #################################
 tools/xngen src/f32-hswish/sse.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/sse-x4.c
diff --git a/src/f32-hswish/gen/neon-x4.c b/src/f32-hswish/gen/neon-x4.c
index 5f9a1fe..818308d 100644
--- a/src/f32-hswish/gen/neon-x4.c
+++ b/src/f32-hswish/gen/neon-x4.c
@@ -25,37 +25,34 @@
   assert(n % sizeof(float) == 0);
 
   const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
+  const float32x4_t vthree = vld1q_dup_f32(&params->scalar.three);
+  const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(&params->scalar.six));
+  const int32x4_t vzero = vdupq_n_s32(0);
 
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-
-    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-
-    vacc0123 = vminq_f32(vacc0123, vone);
-
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-
-    vst1q_f32(y, vacc0123); y += 4;
+    float32x4_t vx = vld1q_f32(x); x += 4;
+    float32x4_t vacc = vaddq_f32(vx, vthree);
+    vx = vmulq_f32(vx, vsixth);
+    vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+    vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+    vacc = vmulq_f32(vacc, vx);
+    vst1q_f32(y, vacc); y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);
-    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    float32x4_t vx = vld1q_f32(x);
+    float32x4_t vacc = vaddq_f32(vx, vthree);
+    vx = vmulq_f32(vx, vsixth);
+    vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+    vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+    vacc = vmulq_f32(vacc, vx);
 
-    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    float32x2_t vacc_lo = vget_low_f32(vacc);
     if (n & (2 * sizeof(float))) {
-      vst1_f32(y, vacc01); y += 2;
-      vacc01 = vget_high_f32(vacc0123);
+      vst1_f32(y, vacc_lo); y += 2;
+      vacc_lo = vget_high_f32(vacc);
     }
     if (n & (1 * sizeof(float))) {
-      vst1_lane_f32(y, vacc01, 0);
+      vst1_lane_f32(y, vacc_lo, 0);
     }
   }
 }
diff --git a/src/f32-hswish/gen/neon-x8.c b/src/f32-hswish/gen/neon-x8.c
index 825808c..f949f8b 100644
--- a/src/f32-hswish/gen/neon-x8.c
+++ b/src/f32-hswish/gen/neon-x8.c
@@ -25,22 +25,24 @@
   assert(n % sizeof(float) == 0);
 
   const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
+  const float32x4_t vthree = vld1q_dup_f32(&params->scalar.three);
+  const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(&params->scalar.six));
+  const int32x4_t vzero = vdupq_n_s32(0);
 
   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    float32x4_t vx4567 = vld1q_f32(x); x += 4;
 
-    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-    float32x4_t vacc4567 = vmlaq_f32(vhalf, vx4567, vsixth);
+    float32x4_t vacc0123 = vaddq_f32(vx0123, vthree);
+    vx0123 = vmulq_f32(vx0123, vsixth);
+    float32x4_t vacc4567 = vaddq_f32(vx4567, vthree);
+    vx4567 = vmulq_f32(vx4567, vsixth);
 
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc4567 = vmaxq_f32(vacc4567, vzero);
+    vacc0123 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc0123), vzero));
+    vacc4567 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc4567), vzero));
 
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc4567 = vminq_f32(vacc4567, vone);
+    vacc0123 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc0123), vsix));
+    vacc4567 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc4567), vsix));
 
     vacc0123 = vmulq_f32(vacc0123, vx0123);
     vacc4567 = vmulq_f32(vacc4567, vx4567);
@@ -49,27 +51,29 @@
     vst1q_f32(y, vacc4567); y += 4;
   }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-    vst1q_f32(y, vacc0123); y += 4;
+    float32x4_t vx = vld1q_f32(x); x += 4;
+    float32x4_t vacc = vaddq_f32(vx, vthree);
+    vx = vmulq_f32(vx, vsixth);
+    vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+    vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+    vacc = vmulq_f32(vacc, vx);
+    vst1q_f32(y, vacc); y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);
-    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    float32x4_t vx = vld1q_f32(x);
+    float32x4_t vacc = vaddq_f32(vx, vthree);
+    vx = vmulq_f32(vx, vsixth);
+    vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+    vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+    vacc = vmulq_f32(vacc, vx);
 
-    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    float32x2_t vacc_lo = vget_low_f32(vacc);
     if (n & (2 * sizeof(float))) {
-      vst1_f32(y, vacc01); y += 2;
-      vacc01 = vget_high_f32(vacc0123);
+      vst1_f32(y, vacc_lo); y += 2;
+      vacc_lo = vget_high_f32(vacc);
     }
     if (n & (1 * sizeof(float))) {
-      vst1_lane_f32(y, vacc01, 0);
+      vst1_lane_f32(y, vacc_lo, 0);
     }
   }
 }
diff --git a/src/f32-hswish/gen/neonfma-x4.c b/src/f32-hswish/gen/neonfma-x4.c
deleted file mode 100644
index b31510e..0000000
--- a/src/f32-hswish/gen/neonfma-x4.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/neon.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__neonfma_x4(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
-
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-
-    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-
-    vacc0123 = vminq_f32(vacc0123, vone);
-
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-
-    vst1q_f32(y, vacc0123); y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);
-    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-
-    float32x2_t vacc01 = vget_low_f32(vacc0123);
-    if (n & (2 * sizeof(float))) {
-      vst1_f32(y, vacc01); y += 2;
-      vacc01 = vget_high_f32(vacc0123);
-    }
-    if (n & (1 * sizeof(float))) {
-      vst1_lane_f32(y, vacc01, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/neonfma-x8.c b/src/f32-hswish/gen/neonfma-x8.c
deleted file mode 100644
index 6371884..0000000
--- a/src/f32-hswish/gen/neonfma-x8.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/neon.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__neonfma_x8(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
-
-  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
-
-    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-    float32x4_t vacc4567 = vfmaq_f32(vhalf, vx4567, vsixth);
-
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc4567 = vmaxq_f32(vacc4567, vzero);
-
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc4567 = vminq_f32(vacc4567, vone);
-
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-    vacc4567 = vmulq_f32(vacc4567, vx4567);
-
-    vst1q_f32(y, vacc0123); y += 4;
-    vst1q_f32(y, vacc4567); y += 4;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-    vst1q_f32(y, vacc0123); y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);
-    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
-
-    float32x2_t vacc01 = vget_low_f32(vacc0123);
-    if (n & (2 * sizeof(float))) {
-      vst1_f32(y, vacc01); y += 2;
-      vacc01 = vget_high_f32(vacc0123);
-    }
-    if (n & (1 * sizeof(float))) {
-      vst1_lane_f32(y, vacc01, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/psimd-x4.c b/src/f32-hswish/gen/psimd-x4.c
deleted file mode 100644
index 302a9dc..0000000
--- a/src/f32-hswish/gen/psimd-x4.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/psimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd_x4(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
-  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
-  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
-  const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const psimd_f32 vx0123 = psimd_load_f32(x);
-    x += 4;
-
-    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-
-    vacc0123 = psimd_max_f32(vacc0123, vzero);
-
-    vacc0123 = psimd_min_f32(vacc0123, vone);
-
-    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
-    psimd_store_f32(y, vacc0123);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const psimd_f32 vx0123 = psimd_load_f32(x);
-    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-    vacc0123 = psimd_max_f32(vacc0123, vzero);
-    vacc0123 = psimd_min_f32(vacc0123, vone);
-    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
-    if (n & (2 * sizeof(float))) {
-      psimd_store2_f32(y, vacc0123);
-      vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      psimd_store1_f32(y, vacc0123);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/psimd-x8.c b/src/f32-hswish/gen/psimd-x8.c
deleted file mode 100644
index bb0135d..0000000
--- a/src/f32-hswish/gen/psimd-x8.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/psimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd_x8(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
-  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
-  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
-  const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
-  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const psimd_f32 vx0123 = psimd_load_f32(x);
-    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
-    x += 8;
-
-    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-    psimd_f32 vacc4567 = psimd_qfma_f32(vhalf, vx4567, vsixth);
-
-    vacc0123 = psimd_max_f32(vacc0123, vzero);
-    vacc4567 = psimd_max_f32(vacc4567, vzero);
-
-    vacc0123 = psimd_min_f32(vacc0123, vone);
-    vacc4567 = psimd_min_f32(vacc4567, vone);
-
-    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-    vacc4567 = psimd_mul_f32(vacc4567, vx4567);
-
-    psimd_store_f32(y, vacc0123);
-    psimd_store_f32(y + 4, vacc4567);
-    y += 8;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const psimd_f32 vx0123 = psimd_load_f32(x);
-    x += 4;
-    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-    vacc0123 = psimd_max_f32(vacc0123, vzero);
-    vacc0123 = psimd_min_f32(vacc0123, vone);
-    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-    psimd_store_f32(y, vacc0123);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const psimd_f32 vx0123 = psimd_load_f32(x);
-    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-    vacc0123 = psimd_max_f32(vacc0123, vzero);
-    vacc0123 = psimd_min_f32(vacc0123, vone);
-    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
-    if (n & (2 * sizeof(float))) {
-      psimd_store2_f32(y, vacc0123);
-      vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      psimd_store1_f32(y, vacc0123);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/scalar-x1.c b/src/f32-hswish/gen/scalar-x1.c
index fae0788..c93e22b 100644
--- a/src/f32-hswish/gen/scalar-x1.c
+++ b/src/f32-hswish/gen/scalar-x1.c
@@ -24,17 +24,19 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   for (; n >= sizeof(float); n -= sizeof(float)) {
-    const float vx = *x++;
-    float vacc = vx * vsixth + vhalf;
-    vacc = math_max_f32(vacc, 0.0f);
-    vacc = math_min_f32(vacc, vone);
-    vacc = vacc * vx;
+    float vx = *x++;
+    float vacc = vx + vthree;
+    vx *= vsixth;
+    vacc = math_max_f32(vacc, vzero);
+    vacc = math_min_f32(vacc, vsix);
+    vacc *= vx;
     *y++ = vacc;
   }
 }
diff --git a/src/f32-hswish/gen/scalar-x2.c b/src/f32-hswish/gen/scalar-x2.c
index 0b879ea..d84310e 100644
--- a/src/f32-hswish/gen/scalar-x2.c
+++ b/src/f32-hswish/gen/scalar-x2.c
@@ -24,24 +24,27 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
-    const float vx0 = x[0];
-    const float vx1 = x[1];
+    float vx0 = x[0];
+    float vx1 = x[1];
     x += 2;
 
-    float vacc0 = vx0 * vsixth + vhalf;
-    float vacc1 = vx1 * vsixth + vhalf;
+    float vacc0 = vx0 + vthree;
+    vx0 *= vsixth;
+    float vacc1 = vx1 + vthree;
+    vx1 *= vsixth;
 
-    vacc0 = math_max_f32(vacc0, 0.0f);
-    vacc1 = math_max_f32(vacc1, 0.0f);
+    vacc0 = math_max_f32(vacc0, vzero);
+    vacc1 = math_max_f32(vacc1, vzero);
 
-    vacc0 = math_min_f32(vacc0, vone);
-    vacc1 = math_min_f32(vacc1, vone);
+    vacc0 = math_min_f32(vacc0, vsix);
+    vacc1 = math_min_f32(vacc1, vsix);
 
     vacc0 *= vx0;
     vacc1 *= vx1;
@@ -51,11 +54,12 @@
     y += 2;
   }
   if XNN_UNLIKELY(n != 0) {
-    const float vx = *x;
-    float vacc = vx * vsixth + vhalf;
-    vacc = math_max_f32(vacc, 0.0f);
-    vacc = math_min_f32(vacc, vone);
-    vacc = vacc * vx;
+    float vx = *x;
+    float vacc = vx + vthree;
+    vx *= vsixth;
+    vacc = math_max_f32(vacc, vzero);
+    vacc = math_min_f32(vacc, vsix);
+    vacc *= vx;
     *y = vacc;
   }
 }
diff --git a/src/f32-hswish/gen/scalar-x4.c b/src/f32-hswish/gen/scalar-x4.c
index 23f77c0..6ef7dfc 100644
--- a/src/f32-hswish/gen/scalar-x4.c
+++ b/src/f32-hswish/gen/scalar-x4.c
@@ -24,32 +24,37 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float vx0 = x[0];
-    const float vx1 = x[1];
-    const float vx2 = x[2];
-    const float vx3 = x[3];
+    float vx0 = x[0];
+    float vx1 = x[1];
+    float vx2 = x[2];
+    float vx3 = x[3];
     x += 4;
 
-    float vacc0 = vx0 * vsixth + vhalf;
-    float vacc1 = vx1 * vsixth + vhalf;
-    float vacc2 = vx2 * vsixth + vhalf;
-    float vacc3 = vx3 * vsixth + vhalf;
+    float vacc0 = vx0 + vthree;
+    vx0 *= vsixth;
+    float vacc1 = vx1 + vthree;
+    vx1 *= vsixth;
+    float vacc2 = vx2 + vthree;
+    vx2 *= vsixth;
+    float vacc3 = vx3 + vthree;
+    vx3 *= vsixth;
 
-    vacc0 = math_max_f32(vacc0, 0.0f);
-    vacc1 = math_max_f32(vacc1, 0.0f);
-    vacc2 = math_max_f32(vacc2, 0.0f);
-    vacc3 = math_max_f32(vacc3, 0.0f);
+    vacc0 = math_max_f32(vacc0, vzero);
+    vacc1 = math_max_f32(vacc1, vzero);
+    vacc2 = math_max_f32(vacc2, vzero);
+    vacc3 = math_max_f32(vacc3, vzero);
 
-    vacc0 = math_min_f32(vacc0, vone);
-    vacc1 = math_min_f32(vacc1, vone);
-    vacc2 = math_min_f32(vacc2, vone);
-    vacc3 = math_min_f32(vacc3, vone);
+    vacc0 = math_min_f32(vacc0, vsix);
+    vacc1 = math_min_f32(vacc1, vsix);
+    vacc2 = math_min_f32(vacc2, vsix);
+    vacc3 = math_min_f32(vacc3, vsix);
 
     vacc0 *= vx0;
     vacc1 *= vx1;
@@ -64,11 +69,12 @@
   }
   if XNN_UNLIKELY(n != 0) {
     do {
-      const float vx = *x++;
-      float vacc = vx * vsixth + vhalf;
-      vacc = math_max_f32(vacc, 0.0f);
-      vacc = math_min_f32(vacc, vone);
-      vacc = vacc * vx;
+      float vx = *x++;
+      float vacc = vx + vthree;
+      vx *= vsixth;
+      vacc = math_max_f32(vacc, vzero);
+      vacc = math_min_f32(vacc, vsix);
+      vacc *= vx;
       *y++ = vacc;
       n -= sizeof(float);
     } while (n != 0);
diff --git a/src/f32-hswish/gen/wasm-x1.c b/src/f32-hswish/gen/wasm-x1.c
index 9cb44c2..e3c520c 100644
--- a/src/f32-hswish/gen/wasm-x1.c
+++ b/src/f32-hswish/gen/wasm-x1.c
@@ -24,17 +24,19 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   for (; n >= sizeof(float); n -= sizeof(float)) {
-    const float vx = *x++;
-    float vacc = vx * vsixth + vhalf;
-    vacc = __builtin_wasm_max_f32(vacc, 0.0f);
-    vacc = __builtin_wasm_min_f32(vacc, vone);
-    vacc = vacc * vx;
+    float vx = *x++;
+    float vacc = vx + vthree;
+    vx *= vsixth;
+    vacc = __builtin_wasm_max_f32(vacc, vzero);
+    vacc = __builtin_wasm_min_f32(vacc, vsix);
+    vacc *= vx;
     *y++ = vacc;
   }
 }
diff --git a/src/f32-hswish/gen/wasm-x2.c b/src/f32-hswish/gen/wasm-x2.c
index 531dece..ed8bebb 100644
--- a/src/f32-hswish/gen/wasm-x2.c
+++ b/src/f32-hswish/gen/wasm-x2.c
@@ -24,24 +24,27 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
-    const float vx0 = x[0];
-    const float vx1 = x[1];
+    float vx0 = x[0];
+    float vx1 = x[1];
     x += 2;
 
-    float vacc0 = vx0 * vsixth + vhalf;
-    float vacc1 = vx1 * vsixth + vhalf;
+    float vacc0 = vx0 + vthree;
+    vx0 *= vsixth;
+    float vacc1 = vx1 + vthree;
+    vx1 *= vsixth;
 
-    vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
-    vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);
+    vacc0 = __builtin_wasm_max_f32(vacc0, vzero);
+    vacc1 = __builtin_wasm_max_f32(vacc1, vzero);
 
-    vacc0 = __builtin_wasm_min_f32(vacc0, vone);
-    vacc1 = __builtin_wasm_min_f32(vacc1, vone);
+    vacc0 = __builtin_wasm_min_f32(vacc0, vsix);
+    vacc1 = __builtin_wasm_min_f32(vacc1, vsix);
 
     vacc0 *= vx0;
     vacc1 *= vx1;
@@ -51,11 +54,12 @@
     y += 2;
   }
   if XNN_UNLIKELY(n != 0) {
-    const float vx = *x;
-    float vacc = vx * vsixth + vhalf;
-    vacc = __builtin_wasm_max_f32(vacc, 0.0f);
-    vacc = __builtin_wasm_min_f32(vacc, vone);
-    vacc = vacc * vx;
+    float vx = *x;
+    float vacc = vx + vthree;
+    vx *= vsixth;
+    vacc = __builtin_wasm_max_f32(vacc, vzero);
+    vacc = __builtin_wasm_min_f32(vacc, vsix);
+    vacc *= vx;
     *y = vacc;
   }
 }
diff --git a/src/f32-hswish/gen/wasm-x4.c b/src/f32-hswish/gen/wasm-x4.c
index d31ea04..696d055 100644
--- a/src/f32-hswish/gen/wasm-x4.c
+++ b/src/f32-hswish/gen/wasm-x4.c
@@ -24,32 +24,37 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float vx0 = x[0];
-    const float vx1 = x[1];
-    const float vx2 = x[2];
-    const float vx3 = x[3];
+    float vx0 = x[0];
+    float vx1 = x[1];
+    float vx2 = x[2];
+    float vx3 = x[3];
     x += 4;
 
-    float vacc0 = vx0 * vsixth + vhalf;
-    float vacc1 = vx1 * vsixth + vhalf;
-    float vacc2 = vx2 * vsixth + vhalf;
-    float vacc3 = vx3 * vsixth + vhalf;
+    float vacc0 = vx0 + vthree;
+    vx0 *= vsixth;
+    float vacc1 = vx1 + vthree;
+    vx1 *= vsixth;
+    float vacc2 = vx2 + vthree;
+    vx2 *= vsixth;
+    float vacc3 = vx3 + vthree;
+    vx3 *= vsixth;
 
-    vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
-    vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);
-    vacc2 = __builtin_wasm_max_f32(vacc2, 0.0f);
-    vacc3 = __builtin_wasm_max_f32(vacc3, 0.0f);
+    vacc0 = __builtin_wasm_max_f32(vacc0, vzero);
+    vacc1 = __builtin_wasm_max_f32(vacc1, vzero);
+    vacc2 = __builtin_wasm_max_f32(vacc2, vzero);
+    vacc3 = __builtin_wasm_max_f32(vacc3, vzero);
 
-    vacc0 = __builtin_wasm_min_f32(vacc0, vone);
-    vacc1 = __builtin_wasm_min_f32(vacc1, vone);
-    vacc2 = __builtin_wasm_min_f32(vacc2, vone);
-    vacc3 = __builtin_wasm_min_f32(vacc3, vone);
+    vacc0 = __builtin_wasm_min_f32(vacc0, vsix);
+    vacc1 = __builtin_wasm_min_f32(vacc1, vsix);
+    vacc2 = __builtin_wasm_min_f32(vacc2, vsix);
+    vacc3 = __builtin_wasm_min_f32(vacc3, vsix);
 
     vacc0 *= vx0;
     vacc1 *= vx1;
@@ -64,11 +69,12 @@
   }
   if XNN_UNLIKELY(n != 0) {
     do {
-      const float vx = *x++;
-      float vacc = vx * vsixth + vhalf;
-      vacc = __builtin_wasm_max_f32(vacc, 0.0f);
-      vacc = __builtin_wasm_min_f32(vacc, vone);
-      vacc = vacc * vx;
+      float vx = *x++;
+      float vacc = vx + vthree;
+      vx *= vsixth;
+      vacc = __builtin_wasm_max_f32(vacc, vzero);
+      vacc = __builtin_wasm_min_f32(vacc, vsix);
+      vacc *= vx;
       *y++ = vacc;
       n -= sizeof(float);
     } while (n != 0);
diff --git a/src/f32-hswish/gen/wasmsimd-arm-x16.c b/src/f32-hswish/gen/wasmsimd-arm-x16.c
deleted file mode 100644
index 1280245..0000000
--- a/src/f32-hswish/gen/wasmsimd-arm-x16.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/wasmsimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_arm_x16(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
-  const v128_t vzero = wasm_f32x4_splat(0.0f);
-
-  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
-    const v128_t vx0123 = wasm_v128_load(x);
-    const v128_t vx4567 = wasm_v128_load(x + 4);
-    const v128_t vx89AB = wasm_v128_load(x + 8);
-    const v128_t vxCDEF = wasm_v128_load(x + 12);
-    x += 16;
-
-    v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
-    v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
-    v128_t vacc89AB = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx89AB, vsixth));
-    v128_t vaccCDEF = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vxCDEF, vsixth));
-
-    vacc0123 = wasm_f32x4_max(vacc0123, vzero);
-    vacc4567 = wasm_f32x4_max(vacc4567, vzero);
-    vacc89AB = wasm_f32x4_max(vacc89AB, vzero);
-    vaccCDEF = wasm_f32x4_max(vaccCDEF, vzero);
-
-    vacc0123 = wasm_f32x4_min(vacc0123, vone);
-    vacc4567 = wasm_f32x4_min(vacc4567, vone);
-    vacc89AB = wasm_f32x4_min(vacc89AB, vone);
-    vaccCDEF = wasm_f32x4_min(vaccCDEF, vone);
-
-    vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
-    vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
-    vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB);
-    vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
-
-    wasm_v128_store(y, vacc0123);
-    wasm_v128_store(y + 4, vacc4567);
-    wasm_v128_store(y + 8, vacc89AB);
-    wasm_v128_store(y + 12, vaccCDEF);
-    y += 16;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    vacc = wasm_f32x4_max(vacc, vzero);
-    vacc = wasm_f32x4_min(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-
-    wasm_v128_store(y, vacc);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    vacc = wasm_f32x4_max(vacc, vzero);
-    vacc = wasm_f32x4_min(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-
-    if (n & (2 * sizeof(float))) {
-      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
-      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      *y = wasm_f32x4_extract_lane(vacc, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-arm-x8.c b/src/f32-hswish/gen/wasmsimd-arm-x8.c
deleted file mode 100644
index 7cebce8..0000000
--- a/src/f32-hswish/gen/wasmsimd-arm-x8.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/wasmsimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_arm_x8(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
-  const v128_t vzero = wasm_f32x4_splat(0.0f);
-
-  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const v128_t vx0123 = wasm_v128_load(x);
-    const v128_t vx4567 = wasm_v128_load(x + 4);
-    x += 8;
-
-    v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
-    v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
-
-    vacc0123 = wasm_f32x4_max(vacc0123, vzero);
-    vacc4567 = wasm_f32x4_max(vacc4567, vzero);
-
-    vacc0123 = wasm_f32x4_min(vacc0123, vone);
-    vacc4567 = wasm_f32x4_min(vacc4567, vone);
-
-    vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
-    vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
-
-    wasm_v128_store(y, vacc0123);
-    wasm_v128_store(y + 4, vacc4567);
-    y += 8;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    vacc = wasm_f32x4_max(vacc, vzero);
-    vacc = wasm_f32x4_min(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-
-    wasm_v128_store(y, vacc);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    vacc = wasm_f32x4_max(vacc, vzero);
-    vacc = wasm_f32x4_min(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-
-    if (n & (2 * sizeof(float))) {
-      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
-      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      *y = wasm_f32x4_extract_lane(vacc, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-x16.c b/src/f32-hswish/gen/wasmsimd-x16.c
new file mode 100644
index 0000000..dcbcd77
--- /dev/null
+++ b/src/f32-hswish/gen/wasmsimd-x16.c
@@ -0,0 +1,100 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
+  const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+  const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    v128_t vx0123 = wasm_v128_load(x);
+    v128_t vx4567 = wasm_v128_load(x + 4);
+    v128_t vx89AB = wasm_v128_load(x + 8);
+    v128_t vxCDEF = wasm_v128_load(x + 12);
+    x += 16;
+
+    v128_t vacc0123 = wasm_f32x4_add(vx0123, vthree);
+    vx0123 = wasm_f32x4_mul(vx0123, vsixth);
+    v128_t vacc4567 = wasm_f32x4_add(vx4567, vthree);
+    vx4567 = wasm_f32x4_mul(vx4567, vsixth);
+    v128_t vacc89AB = wasm_f32x4_add(vx89AB, vthree);
+    vx89AB = wasm_f32x4_mul(vx89AB, vsixth);
+    v128_t vaccCDEF = wasm_f32x4_add(vxCDEF, vthree);
+    vxCDEF = wasm_f32x4_mul(vxCDEF, vsixth);
+
+    vacc0123 = wasm_i32x4_max(vacc0123, vzero);
+    vacc4567 = wasm_i32x4_max(vacc4567, vzero);
+    vacc89AB = wasm_i32x4_max(vacc89AB, vzero);
+    vaccCDEF = wasm_i32x4_max(vaccCDEF, vzero);
+
+    vacc0123 = wasm_i32x4_min(vacc0123, vsix);
+    vacc4567 = wasm_i32x4_min(vacc4567, vsix);
+    vacc89AB = wasm_i32x4_min(vacc89AB, vsix);
+    vaccCDEF = wasm_i32x4_min(vaccCDEF, vsix);
+
+    vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
+    vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
+    vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB);
+    vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
+
+    wasm_v128_store(y, vacc0123);
+    wasm_v128_store(y + 4, vacc4567);
+    wasm_v128_store(y + 8, vacc89AB);
+    wasm_v128_store(y + 12, vaccCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vx = wasm_v128_load(x);
+    x += 4;
+
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
+    vacc = wasm_f32x4_mul(vacc, vx);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vx = wasm_v128_load(x);
+
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
+    vacc = wasm_f32x4_mul(vacc, vx);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/wasmsimd-arm-x4.c b/src/f32-hswish/gen/wasmsimd-x4.c
similarity index 67%
rename from src/f32-hswish/gen/wasmsimd-arm-x4.c
rename to src/f32-hswish/gen/wasmsimd-x4.c
index bc9ba3c..0e63426 100644
--- a/src/f32-hswish/gen/wasmsimd-arm-x4.c
+++ b/src/f32-hswish/gen/wasmsimd-x4.c
@@ -15,7 +15,7 @@
 #include <xnnpack/hswish.h>
 
 
-void xnn_f32_hswish_ukernel__wasmsimd_arm_x4(
+void xnn_f32_hswish_ukernel__wasmsimd_x4(
     size_t n,
     const float* x,
     float* y,
@@ -25,28 +25,30 @@
   assert(n % sizeof(float) == 0);
 
   const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
+  const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+  const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
   const v128_t vzero = wasm_f32x4_splat(0.0f);
 
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
+    v128_t vx = wasm_v128_load(x);
     x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
 
-    vacc = wasm_f32x4_max(vacc, vzero);
-    vacc = wasm_f32x4_min(vacc, vone);
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
     vacc = wasm_f32x4_mul(vacc, vx);
 
     wasm_v128_store(y, vacc);
     y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
+    v128_t vx = wasm_v128_load(x);
 
-    vacc = wasm_f32x4_max(vacc, vzero);
-    vacc = wasm_f32x4_min(vacc, vone);
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
     vacc = wasm_f32x4_mul(vacc, vx);
 
     if (n & (2 * sizeof(float))) {
diff --git a/src/f32-hswish/gen/wasmsimd-x8.c b/src/f32-hswish/gen/wasmsimd-x8.c
new file mode 100644
index 0000000..a078759
--- /dev/null
+++ b/src/f32-hswish/gen/wasmsimd-x8.c
@@ -0,0 +1,86 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
+  const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+  const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    v128_t vx0123 = wasm_v128_load(x);
+    v128_t vx4567 = wasm_v128_load(x + 4);
+    x += 8;
+
+    v128_t vacc0123 = wasm_f32x4_add(vx0123, vthree);
+    vx0123 = wasm_f32x4_mul(vx0123, vsixth);
+    v128_t vacc4567 = wasm_f32x4_add(vx4567, vthree);
+    vx4567 = wasm_f32x4_mul(vx4567, vsixth);
+
+    vacc0123 = wasm_i32x4_max(vacc0123, vzero);
+    vacc4567 = wasm_i32x4_max(vacc4567, vzero);
+
+    vacc0123 = wasm_i32x4_min(vacc0123, vsix);
+    vacc4567 = wasm_i32x4_min(vacc4567, vsix);
+
+    vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
+    vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
+
+    wasm_v128_store(y, vacc0123);
+    wasm_v128_store(y + 4, vacc4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    v128_t vx = wasm_v128_load(x);
+    x += 4;
+
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
+    vacc = wasm_f32x4_mul(vacc, vx);
+
+    wasm_v128_store(y, vacc);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    v128_t vx = wasm_v128_load(x);
+
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
+    vacc = wasm_f32x4_mul(vacc, vx);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vacc, 0);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/wasmsimd-x86-x16.c b/src/f32-hswish/gen/wasmsimd-x86-x16.c
deleted file mode 100644
index 441a62d..0000000
--- a/src/f32-hswish/gen/wasmsimd-x86-x16.c
+++ /dev/null
@@ -1,106 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/wasmsimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_x86_x16(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
-  const v128_t vzero = wasm_f32x4_splat(0.0f);
-
-  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
-    const v128_t vx0123 = wasm_v128_load(x);
-    const v128_t vx4567 = wasm_v128_load(x + 4);
-    const v128_t vx89AB = wasm_v128_load(x + 8);
-    const v128_t vxCDEF = wasm_v128_load(x + 12);
-    x += 16;
-
-    v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
-    v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
-    v128_t vacc89AB = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx89AB, vsixth));
-    v128_t vaccCDEF = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vxCDEF, vsixth));
-
-    const v128_t vmasklt0123 = wasm_f32x4_lt(vacc0123, vzero);
-    vacc0123 = wasm_v128_andnot(vacc0123, vmasklt0123);
-    const v128_t vmasklt4567 = wasm_f32x4_lt(vacc4567, vzero);
-    vacc4567 = wasm_v128_andnot(vacc4567, vmasklt4567);
-    const v128_t vmasklt89AB = wasm_f32x4_lt(vacc89AB, vzero);
-    vacc89AB = wasm_v128_andnot(vacc89AB, vmasklt89AB);
-    const v128_t vmaskltCDEF = wasm_f32x4_lt(vaccCDEF, vzero);
-    vaccCDEF = wasm_v128_andnot(vaccCDEF, vmaskltCDEF);
-
-    const v128_t vmaskge0123 = wasm_f32x4_ge(vacc0123, vone);
-    vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
-    const v128_t vmaskge4567 = wasm_f32x4_ge(vacc4567, vone);
-    vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
-    const v128_t vmaskge89AB = wasm_f32x4_ge(vacc89AB, vone);
-    vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB);
-    const v128_t vmaskgeCDEF = wasm_f32x4_ge(vaccCDEF, vone);
-    vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
-
-    vacc0123 = wasm_v128_bitselect(vx0123, vacc0123, vmaskge0123);
-    vacc4567 = wasm_v128_bitselect(vx4567, vacc4567, vmaskge4567);
-    vacc89AB = wasm_v128_bitselect(vx89AB, vacc89AB, vmaskge89AB);
-    vaccCDEF = wasm_v128_bitselect(vxCDEF, vaccCDEF, vmaskgeCDEF);
-
-    wasm_v128_store(y, vacc0123);
-    wasm_v128_store(y + 4, vacc4567);
-    wasm_v128_store(y + 8, vacc89AB);
-    wasm_v128_store(y + 12, vaccCDEF);
-    y += 16;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-    vacc = wasm_v128_andnot(vacc, vmasklt);
-    const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-    vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
-    wasm_v128_store(y, vacc);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-    vacc = wasm_v128_andnot(vacc, vmasklt);
-    const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-    vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
-    if (n & (2 * sizeof(float))) {
-      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
-      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      *y = wasm_f32x4_extract_lane(vacc, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-x86-x4.c b/src/f32-hswish/gen/wasmsimd-x86-x4.c
deleted file mode 100644
index 335d787..0000000
--- a/src/f32-hswish/gen/wasmsimd-x86-x4.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/wasmsimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_x86_x4(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
-  const v128_t vzero = wasm_f32x4_splat(0.0f);
-
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-    vacc = wasm_v128_andnot(vacc, vmasklt);
-    const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-    vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
-    wasm_v128_store(y, vacc);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-    vacc = wasm_v128_andnot(vacc, vmasklt);
-    const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-    vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
-    if (n & (2 * sizeof(float))) {
-      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
-      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      *y = wasm_f32x4_extract_lane(vacc, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-x86-x8.c b/src/f32-hswish/gen/wasmsimd-x86-x8.c
deleted file mode 100644
index b3b6762..0000000
--- a/src/f32-hswish/gen/wasmsimd-x86-x8.c
+++ /dev/null
@@ -1,90 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-hswish/wasmsimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_x86_x8(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
-  const v128_t vzero = wasm_f32x4_splat(0.0f);
-
-  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const v128_t vx0123 = wasm_v128_load(x);
-    const v128_t vx4567 = wasm_v128_load(x + 4);
-    x += 8;
-
-    v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
-    v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
-
-    const v128_t vmasklt0123 = wasm_f32x4_lt(vacc0123, vzero);
-    vacc0123 = wasm_v128_andnot(vacc0123, vmasklt0123);
-    const v128_t vmasklt4567 = wasm_f32x4_lt(vacc4567, vzero);
-    vacc4567 = wasm_v128_andnot(vacc4567, vmasklt4567);
-
-    const v128_t vmaskge0123 = wasm_f32x4_ge(vacc0123, vone);
-    vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
-    const v128_t vmaskge4567 = wasm_f32x4_ge(vacc4567, vone);
-    vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
-
-    vacc0123 = wasm_v128_bitselect(vx0123, vacc0123, vmaskge0123);
-    vacc4567 = wasm_v128_bitselect(vx4567, vacc4567, vmaskge4567);
-
-    wasm_v128_store(y, vacc0123);
-    wasm_v128_store(y + 4, vacc4567);
-    y += 8;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-    vacc = wasm_v128_andnot(vacc, vmasklt);
-    const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-    vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
-    wasm_v128_store(y, vacc);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
-    const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-    vacc = wasm_v128_andnot(vacc, vmasklt);
-    const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-    vacc = wasm_f32x4_mul(vacc, vx);
-    vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
-    if (n & (2 * sizeof(float))) {
-      *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
-      vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      *y = wasm_f32x4_extract_lane(vacc, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/neon.c.in b/src/f32-hswish/neon.c.in
index f8af0f7..61386b2 100644
--- a/src/f32-hswish/neon.c.in
+++ b/src/f32-hswish/neon.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/hswish.h>
 
 
-void xnn_f32_hswish_ukernel__${"neonfma" if FMA else "neon"}_x${BATCH_TILE}(
+void xnn_f32_hswish_ukernel__neon_x${BATCH_TILE}(
     size_t n,
     const float* x,
     float* y,
@@ -24,61 +24,55 @@
   assert(n % sizeof(float) == 0);
 
   const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
+  const float32x4_t vthree = vld1q_dup_f32(&params->scalar.three);
+  const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(&params->scalar.six));
+  const int32x4_t vzero = vdupq_n_s32(0);
 
-  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
-    $for N in range(0, BATCH_TILE, 4):
-      const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
-
-    $for N in range(0, BATCH_TILE, 4):
-      $if FMA:
-        float32x4_t vacc${ABC[N:N+4]} = vfmaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
-      $else:
-        float32x4_t vacc${ABC[N:N+4]} = vmlaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vacc${ABC[N:N+4]} = vmaxq_f32(vacc${ABC[N:N+4]}, vzero);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vacc${ABC[N:N+4]} = vminq_f32(vacc${ABC[N:N+4]}, vone);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vacc${ABC[N:N+4]} = vmulq_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      vst1q_f32(y, vacc${ABC[N:N+4]}); y += 4;
-  }
   $if BATCH_TILE > 4:
-    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-      const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-      $if FMA:
-        float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-      $else:
-        float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-      vacc0123 = vmaxq_f32(vacc0123, vzero);
-      vacc0123 = vminq_f32(vacc0123, vone);
-      vacc0123 = vmulq_f32(vacc0123, vx0123);
-      vst1q_f32(y, vacc0123); y += 4;
-    }
-  if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);
-    $if FMA:
-      float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-    $else:
-      float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-    vacc0123 = vmaxq_f32(vacc0123, vzero);
-    vacc0123 = vminq_f32(vacc0123, vone);
-    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
 
-    float32x2_t vacc01 = vget_low_f32(vacc0123);
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vacc${ABC[N:N+4]} = vaddq_f32(vx${ABC[N:N+4]}, vthree);
+        vx${ABC[N:N+4]} = vmulq_f32(vx${ABC[N:N+4]}, vsixth);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vacc${ABC[N:N+4]} = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc${ABC[N:N+4]}), vzero));
+
+      $for N in range(0, BATCH_TILE, 4):
+        vacc${ABC[N:N+4]} = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc${ABC[N:N+4]}), vsix));
+
+      $for N in range(0, BATCH_TILE, 4):
+        vacc${ABC[N:N+4]} = vmulq_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vst1q_f32(y, vacc${ABC[N:N+4]}); y += 4;
+    }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    float32x4_t vx = vld1q_f32(x); x += 4;
+    float32x4_t vacc = vaddq_f32(vx, vthree);
+    vx = vmulq_f32(vx, vsixth);
+    vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+    vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+    vacc = vmulq_f32(vacc, vx);
+    vst1q_f32(y, vacc); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    float32x4_t vx = vld1q_f32(x);
+    float32x4_t vacc = vaddq_f32(vx, vthree);
+    vx = vmulq_f32(vx, vsixth);
+    vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+    vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+    vacc = vmulq_f32(vacc, vx);
+
+    float32x2_t vacc_lo = vget_low_f32(vacc);
     if (n & (2 * sizeof(float))) {
-      vst1_f32(y, vacc01); y += 2;
-      vacc01 = vget_high_f32(vacc0123);
+      vst1_f32(y, vacc_lo); y += 2;
+      vacc_lo = vget_high_f32(vacc);
     }
     if (n & (1 * sizeof(float))) {
-      vst1_lane_f32(y, vacc01, 0);
+      vst1_lane_f32(y, vacc_lo, 0);
     }
   }
 }
diff --git a/src/f32-hswish/psimd.c.in b/src/f32-hswish/psimd.c.in
deleted file mode 100644
index 2f71c29..0000000
--- a/src/f32-hswish/psimd.c.in
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert BATCH_TILE % 4 == 0
-$assert BATCH_TILE >= 4
-$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd_x${BATCH_TILE}(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
-  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
-  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
-  const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
-  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
-    const psimd_f32 vx${ABC[0:4]} = psimd_load_f32(x);
-    $for N in range(4, BATCH_TILE, 4):
-      const psimd_f32 vx${ABC[N:N+4]} = psimd_load_f32(x + ${N});
-    x += ${BATCH_TILE};
-
-    $for N in range(0, BATCH_TILE, 4):
-      psimd_f32 vacc${ABC[N:N+4]} = psimd_qfma_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vacc${ABC[N:N+4]} = psimd_max_f32(vacc${ABC[N:N+4]}, vzero);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vacc${ABC[N:N+4]} = psimd_min_f32(vacc${ABC[N:N+4]}, vone);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vacc${ABC[N:N+4]} = psimd_mul_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
-
-    psimd_store_f32(y, vacc${ABC[0:4]});
-    $for N in range(4, BATCH_TILE, 4):
-      psimd_store_f32(y + ${N}, vacc${ABC[N:N+4]});
-    y += ${BATCH_TILE};
-  }
-  $if BATCH_TILE > 4:
-    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-      const psimd_f32 vx0123 = psimd_load_f32(x);
-      x += 4;
-      psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-      vacc0123 = psimd_max_f32(vacc0123, vzero);
-      vacc0123 = psimd_min_f32(vacc0123, vone);
-      vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-      psimd_store_f32(y, vacc0123);
-      y += 4;
-    }
-  if XNN_UNLIKELY(n != 0) {
-    const psimd_f32 vx0123 = psimd_load_f32(x);
-    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-    vacc0123 = psimd_max_f32(vacc0123, vzero);
-    vacc0123 = psimd_min_f32(vacc0123, vone);
-    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
-    if (n & (2 * sizeof(float))) {
-      psimd_store2_f32(y, vacc0123);
-      vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
-      y += 2;
-    }
-    if (n & (1 * sizeof(float))) {
-      psimd_store1_f32(y, vacc0123);
-    }
-  }
-}
diff --git a/src/f32-hswish/scalar.c.in b/src/f32-hswish/scalar.c.in
index fd9c00b..fa26801 100644
--- a/src/f32-hswish/scalar.c.in
+++ b/src/f32-hswish/scalar.c.in
@@ -24,25 +24,27 @@
   assert(n % sizeof(float) == 0);
 
   const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
+  const float vthree = params->scalar.three;
+  const float vsix = params->scalar.six;
+  const float vzero = 0.0f;
+  assert(vthree == 3.0f);
+  assert(vsix == 6.0f);
 
   $if BATCH_TILE > 1:
     for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
       $for N in range(BATCH_TILE):
-        const float vx${ABC[N]} = x[${N}];
+        float vx${ABC[N]} = x[${N}];
       x += ${BATCH_TILE};
 
       $for N in range(BATCH_TILE):
-        float vacc${ABC[N]} = vx${ABC[N]} * vsixth + vhalf;
+        float vacc${ABC[N]} = vx${ABC[N]} + vthree;
+        vx${ABC[N]} *= vsixth;
 
       $for N in range(BATCH_TILE):
-        vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, 0.0f);
+        vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, vzero);
 
       $for N in range(BATCH_TILE):
-        vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vone);
+        vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vsix);
 
       $for N in range(BATCH_TILE):
         vacc${ABC[N]} *= vx${ABC[N]};
@@ -54,29 +56,32 @@
     if XNN_UNLIKELY(n != 0) {
       $if BATCH_TILE > 2:
         do {
-          const float vx = *x++;
-          float vacc = vx * vsixth + vhalf;
-          vacc = ${MAX_F32}(vacc, 0.0f);
-          vacc = ${MIN_F32}(vacc, vone);
-          vacc = vacc * vx;
+          float vx = *x++;
+          float vacc = vx + vthree;
+          vx *= vsixth;
+          vacc = ${MAX_F32}(vacc, vzero);
+          vacc = ${MIN_F32}(vacc, vsix);
+          vacc *= vx;
           *y++ = vacc;
           n -= sizeof(float);
         } while (n != 0);
       $else:
-        const float vx = *x;
-        float vacc = vx * vsixth + vhalf;
-        vacc = ${MAX_F32}(vacc, 0.0f);
-        vacc = ${MIN_F32}(vacc, vone);
-        vacc = vacc * vx;
+        float vx = *x;
+        float vacc = vx + vthree;
+        vx *= vsixth;
+        vacc = ${MAX_F32}(vacc, vzero);
+        vacc = ${MIN_F32}(vacc, vsix);
+        vacc *= vx;
         *y = vacc;
     }
   $else:
     for (; n >= sizeof(float); n -= sizeof(float)) {
-      const float vx = *x++;
-      float vacc = vx * vsixth + vhalf;
-      vacc = ${MAX_F32}(vacc, 0.0f);
-      vacc = ${MIN_F32}(vacc, vone);
-      vacc = vacc * vx;
+      float vx = *x++;
+      float vacc = vx + vthree;
+      vx *= vsixth;
+      vacc = ${MAX_F32}(vacc, vzero);
+      vacc = ${MIN_F32}(vacc, vsix);
+      vacc *= vx;
       *y++ = vacc;
     }
 }
diff --git a/src/f32-hswish/wasmsimd.c.in b/src/f32-hswish/wasmsimd.c.in
index b8405d1..5854b26 100644
--- a/src/f32-hswish/wasmsimd.c.in
+++ b/src/f32-hswish/wasmsimd.c.in
@@ -14,7 +14,7 @@
 #include <xnnpack/hswish.h>
 
 
-void xnn_f32_hswish_ukernel__wasmsimd_${"x86" if X86 else "arm"}_x${BATCH_TILE}(
+void xnn_f32_hswish_ukernel__wasmsimd_x${BATCH_TILE}(
     size_t n,
     const float* x,
     float* y,
@@ -24,40 +24,29 @@
   assert(n % sizeof(float) == 0);
 
   const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
-  const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
-  const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
+  const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+  const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
   const v128_t vzero = wasm_f32x4_splat(0.0f);
 
   $if BATCH_TILE > 4:
     for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
-      const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
+      v128_t vx${ABC[0:4]} = wasm_v128_load(x);
       $for N in range(4, BATCH_TILE, 4):
-        const v128_t vx${ABC[N:N+4]} = wasm_v128_load(x + ${N});
+        v128_t vx${ABC[N:N+4]} = wasm_v128_load(x + ${N});
       x += ${BATCH_TILE};
 
       $for N in range(0, BATCH_TILE, 4):
-        v128_t vacc${ABC[N:N+4]} = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx${ABC[N:N+4]}, vsixth));
+        v128_t vacc${ABC[N:N+4]} = wasm_f32x4_add(vx${ABC[N:N+4]}, vthree);
+        vx${ABC[N:N+4]} = wasm_f32x4_mul(vx${ABC[N:N+4]}, vsixth);
 
-      $if X86:
-        $for N in range(0, BATCH_TILE, 4):
-          const v128_t vmasklt${ABC[N:N+4]} = wasm_f32x4_lt(vacc${ABC[N:N+4]}, vzero);
-          vacc${ABC[N:N+4]} = wasm_v128_andnot(vacc${ABC[N:N+4]}, vmasklt${ABC[N:N+4]});
+      $for N in range(0, BATCH_TILE, 4):
+        vacc${ABC[N:N+4]} = wasm_i32x4_max(vacc${ABC[N:N+4]}, vzero);
 
-        $for N in range(0, BATCH_TILE, 4):
-          const v128_t vmaskge${ABC[N:N+4]} = wasm_f32x4_ge(vacc${ABC[N:N+4]}, vone);
-          vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+      $for N in range(0, BATCH_TILE, 4):
+        vacc${ABC[N:N+4]} = wasm_i32x4_min(vacc${ABC[N:N+4]}, vsix);
 
-        $for N in range(0, BATCH_TILE, 4):
-          vacc${ABC[N:N+4]} = wasm_v128_bitselect(vx${ABC[N:N+4]}, vacc${ABC[N:N+4]}, vmaskge${ABC[N:N+4]});
-      $else:
-        $for N in range(0, BATCH_TILE, 4):
-          vacc${ABC[N:N+4]} = wasm_f32x4_max(vacc${ABC[N:N+4]}, vzero);
-
-        $for N in range(0, BATCH_TILE, 4):
-          vacc${ABC[N:N+4]} = wasm_f32x4_min(vacc${ABC[N:N+4]}, vone);
-
-        $for N in range(0, BATCH_TILE, 4):
-          vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+      $for N in range(0, BATCH_TILE, 4):
+        vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
 
       wasm_v128_store(y, vacc${ABC[0:4]});
       $for N in range(4, BATCH_TILE, 4):
@@ -65,38 +54,26 @@
       y += ${BATCH_TILE};
     }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const v128_t vx = wasm_v128_load(x);
+    v128_t vx = wasm_v128_load(x);
     x += 4;
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
 
-    $if X86:
-      const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-      vacc = wasm_v128_andnot(vacc, vmasklt);
-      const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-      vacc = wasm_f32x4_mul(vacc, vx);
-      vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-    $else:
-      vacc = wasm_f32x4_max(vacc, vzero);
-      vacc = wasm_f32x4_min(vacc, vone);
-      vacc = wasm_f32x4_mul(vacc, vx);
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
+    vacc = wasm_f32x4_mul(vacc, vx);
 
     wasm_v128_store(y, vacc);
     y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const v128_t vx = wasm_v128_load(x);
-    v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
+    v128_t vx = wasm_v128_load(x);
 
-    $if X86:
-      const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
-      vacc = wasm_v128_andnot(vacc, vmasklt);
-      const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
-      vacc = wasm_f32x4_mul(vacc, vx);
-      vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-    $else:
-      vacc = wasm_f32x4_max(vacc, vzero);
-      vacc = wasm_f32x4_min(vacc, vone);
-      vacc = wasm_f32x4_mul(vacc, vx);
+    v128_t vacc = wasm_f32x4_add(vx, vthree);
+    vx = wasm_f32x4_mul(vx, vsixth);
+    vacc = wasm_i32x4_max(vacc, vzero);
+    vacc = wasm_i32x4_min(vacc, vsix);
+    vacc = wasm_f32x4_mul(vacc, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
diff --git a/src/init.c b/src/init.c
index 5026d87..3001112 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1016,7 +1016,7 @@
     };
     xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon_x8;
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neonfma_x8;
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x8;
     xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
     xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
     xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
@@ -1835,11 +1835,10 @@
     xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
     if (is_wasm_x86) {
       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_x86_x8;
-      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x86_x16;
     } else {
       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_arm_x8;
-      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_arm_x8;
     }
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x16;
     xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__psimd_x8;
     xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8;
     xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_x8;
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
index 571efb7..130c737 100644
--- a/src/xnnpack/hswish.h
+++ b/src/xnnpack/hswish.h
@@ -36,9 +36,6 @@
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon_x4)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon_x8)
 
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma_x8)
-
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse_x4)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse_x8)
 
@@ -51,15 +48,9 @@
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx512f_x16)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx512f_x32)
 
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd_x8)
-
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_arm_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_arm_x8)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_arm_x16)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x86_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x86_x8)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x86_x16)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x4)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x8)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x16)
 
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x1)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x2)
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index 9b74880..4c2f582 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -466,8 +466,8 @@
     }
   #else
     params.scalar.sixth = 0x1.555556p-3f;
-    params.scalar.half = 0.5f;
-    params.scalar.one = 1.0f;
+    params.scalar.three = 3.0f;
+    params.scalar.six = 6.0f;
   #endif
   return params;
 }
@@ -476,8 +476,8 @@
 {
   union xnn_f32_hswish_params params;
   params.scalar.sixth = 0x1.555556p-3f;
-  params.scalar.half = 0.5f;
-  params.scalar.one = 1.0f;
+  params.scalar.three = 3.0f;
+  params.scalar.six = 6.0f;
   return params;
 }
 
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c0236d5..7099158 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -209,8 +209,8 @@
 union xnn_f32_hswish_params {
   struct {
     float sixth;
-    float half;
-    float one;
+    float three;
+    float six;
   } scalar;
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   struct {
diff --git a/test/f32-hswish.cc b/test/f32-hswish.cc
index 7a5aeac..9b26996 100644
--- a/test/f32-hswish.cc
+++ b/test/f32-hswish.cc
@@ -111,100 +111,6 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_HSWISH__NEONFMA_X4, batch_eq_4) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    HSwishMicrokernelTester()
-      .batch_size(4)
-      .Test(xnn_f32_hswish_ukernel__neonfma_x4);
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X4, batch_div_4) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x4);
-    }
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X4, batch_lt_4) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 1; batch_size < 4; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x4);
-    }
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X4, batch_gt_4) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 5; batch_size < 8; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x4);
-    }
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X4, inplace) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x4);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_HSWISH__NEONFMA_X8, batch_eq_8) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    HSwishMicrokernelTester()
-      .batch_size(8)
-      .Test(xnn_f32_hswish_ukernel__neonfma_x8);
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X8, batch_div_8) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x8);
-    }
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X8, batch_lt_8) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x8);
-    }
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X8, batch_gt_8) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x8);
-    }
-  }
-
-  TEST(F32_HSWISH__NEONFMA_X8, inplace) {
-    TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__neonfma_x8);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_HSWISH__SSE_X4, batch_eq_4) {
     TEST_REQUIRES_X86_SSE;
@@ -581,347 +487,127 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-  TEST(F32_HSWISH__PSIMD_X4, batch_eq_4) {
-    TEST_REQUIRES_PSIMD;
-    HSwishMicrokernelTester()
-      .batch_size(4)
-      .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
-  }
-
-  TEST(F32_HSWISH__PSIMD_X4, batch_div_4) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_HSWISH__PSIMD_X4, batch_lt_4) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 1; batch_size < 4; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_HSWISH__PSIMD_X4, batch_gt_4) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 5; batch_size < 8; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_HSWISH__PSIMD_X4, inplace) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-  TEST(F32_HSWISH__PSIMD_X8, batch_eq_8) {
-    TEST_REQUIRES_PSIMD;
-    HSwishMicrokernelTester()
-      .batch_size(8)
-      .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
-  }
-
-  TEST(F32_HSWISH__PSIMD_X8, batch_div_8) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_HSWISH__PSIMD_X8, batch_lt_8) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_HSWISH__PSIMD_X8, batch_gt_8) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_HSWISH__PSIMD_X8, inplace) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
-    }
-  }
-#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
 #if XNN_ARCH_WASMSIMD
-  TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_eq_4) {
+  TEST(F32_HSWISH__WASMSIMD_X4, batch_eq_4) {
     HSwishMicrokernelTester()
       .batch_size(4)
-      .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+      .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_div_4) {
+  TEST(F32_HSWISH__WASMSIMD_X4, batch_div_4) {
     for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_lt_4) {
+  TEST(F32_HSWISH__WASMSIMD_X4, batch_lt_4) {
     for (size_t batch_size = 1; batch_size < 4; batch_size++) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_gt_4) {
+  TEST(F32_HSWISH__WASMSIMD_X4, batch_gt_4) {
     for (size_t batch_size = 5; batch_size < 8; batch_size++) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X4, inplace) {
+  TEST(F32_HSWISH__WASMSIMD_X4, inplace) {
     for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
         .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_eq_8) {
+  TEST(F32_HSWISH__WASMSIMD_X8, batch_eq_8) {
     HSwishMicrokernelTester()
       .batch_size(8)
-      .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+      .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_div_8) {
+  TEST(F32_HSWISH__WASMSIMD_X8, batch_div_8) {
     for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_lt_8) {
+  TEST(F32_HSWISH__WASMSIMD_X8, batch_lt_8) {
     for (size_t batch_size = 1; batch_size < 8; batch_size++) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_gt_8) {
+  TEST(F32_HSWISH__WASMSIMD_X8, batch_gt_8) {
     for (size_t batch_size = 9; batch_size < 16; batch_size++) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X8, inplace) {
+  TEST(F32_HSWISH__WASMSIMD_X8, inplace) {
     for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
         .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
 
 
 #if XNN_ARCH_WASMSIMD
-  TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_eq_16) {
+  TEST(F32_HSWISH__WASMSIMD_X16, batch_eq_16) {
     HSwishMicrokernelTester()
       .batch_size(16)
-      .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+      .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_div_16) {
+  TEST(F32_HSWISH__WASMSIMD_X16, batch_div_16) {
     for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_lt_16) {
+  TEST(F32_HSWISH__WASMSIMD_X16, batch_lt_16) {
     for (size_t batch_size = 1; batch_size < 16; batch_size++) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_gt_16) {
+  TEST(F32_HSWISH__WASMSIMD_X16, batch_gt_16) {
     for (size_t batch_size = 17; batch_size < 32; batch_size++) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
     }
   }
 
-  TEST(F32_HSWISH__WASMSIMD_ARM_X16, inplace) {
+  TEST(F32_HSWISH__WASMSIMD_X16, inplace) {
     for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
       HSwishMicrokernelTester()
         .batch_size(batch_size)
         .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD
-
-
-#if XNN_ARCH_WASMSIMD
-  TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_eq_4) {
-    HSwishMicrokernelTester()
-      .batch_size(4)
-      .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_div_4) {
-    for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_lt_4) {
-    for (size_t batch_size = 1; batch_size < 4; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_gt_4) {
-    for (size_t batch_size = 5; batch_size < 8; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X4, inplace) {
-    for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD
-
-
-#if XNN_ARCH_WASMSIMD
-  TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_eq_8) {
-    HSwishMicrokernelTester()
-      .batch_size(8)
-      .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_div_8) {
-    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_lt_8) {
-    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_gt_8) {
-    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X8, inplace) {
-    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD
-
-
-#if XNN_ARCH_WASMSIMD
-  TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_eq_16) {
-    HSwishMicrokernelTester()
-      .batch_size(16)
-      .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_div_16) {
-    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_lt_16) {
-    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_gt_16) {
-    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
-    }
-  }
-
-  TEST(F32_HSWISH__WASMSIMD_X86_X16, inplace) {
-    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
-      HSwishMicrokernelTester()
-        .batch_size(batch_size)
-        .inplace(true)
-        .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
+        .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
     }
   }
 #endif  // XNN_ARCH_WASMSIMD
diff --git a/test/f32-hswish.yaml b/test/f32-hswish.yaml
index 1c33240..781537e 100644
--- a/test/f32-hswish.yaml
+++ b/test/f32-hswish.yaml
@@ -4,8 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_hswish_ukernel__neon_x4
 - name: xnn_f32_hswish_ukernel__neon_x8
-- name: xnn_f32_hswish_ukernel__neonfma_x4
-- name: xnn_f32_hswish_ukernel__neonfma_x8
 - name: xnn_f32_hswish_ukernel__sse_x4
 - name: xnn_f32_hswish_ukernel__sse_x8
 - name: xnn_f32_hswish_ukernel__avx_x8
@@ -14,14 +12,9 @@
 - name: xnn_f32_hswish_ukernel__fma3_x16
 - name: xnn_f32_hswish_ukernel__avx512f_x16
 - name: xnn_f32_hswish_ukernel__avx512f_x32
-- name: xnn_f32_hswish_ukernel__psimd_x4
-- name: xnn_f32_hswish_ukernel__psimd_x8
-- name: xnn_f32_hswish_ukernel__wasmsimd_arm_x4
-- name: xnn_f32_hswish_ukernel__wasmsimd_arm_x8
-- name: xnn_f32_hswish_ukernel__wasmsimd_arm_x16
-- name: xnn_f32_hswish_ukernel__wasmsimd_x86_x4
-- name: xnn_f32_hswish_ukernel__wasmsimd_x86_x8
-- name: xnn_f32_hswish_ukernel__wasmsimd_x86_x16
+- name: xnn_f32_hswish_ukernel__wasmsimd_x4
+- name: xnn_f32_hswish_ukernel__wasmsimd_x8
+- name: xnn_f32_hswish_ukernel__wasmsimd_x16
 - name: xnn_f32_hswish_ukernel__wasm_x1
 - name: xnn_f32_hswish_ukernel__wasm_x2
 - name: xnn_f32_hswish_ukernel__wasm_x4