Refactor HardSwish micro-kernels

- Code-generate HSWISH micro-kernels
- Support unrolling in HSWISH implementation
- Add HSWISH micro-kernels for AVX, FMA3, and AVX512F
- Code-generate HSWISH unit tests
- Switch all platforms to newer versions of the micro-kernels
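
All micro-kernels evaluate HardSwish as y = x * min(max(x/6 + 0.5, 0), 1).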

PiperOrigin-RevId: 284705773
diff --git a/src/f32-hswish/avx.c.in b/src/f32-hswish/avx.c.in
new file mode 100644
index 0000000..9463b82
--- /dev/null
+++ b/src/f32-hswish/avx.c.in
+@@ -0,0 +1,96 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$ABC = "456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
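+// Table of 7 all-ones entries followed by 7 zeros; the remainder path loads 8 consecutive
+// entries starting n bytes before &mask_table[7] to get an all-ones lane per leftover float.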
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+$ISA = {0: "avx", 3: "fma3"}[FMA]
+void xnn_f32_hswish_ukernel__${ISA}_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m256 vsixth = _mm256_broadcast_ps((const __m128*) params->sse.sixth);
+  const __m256 vhalf = _mm256_broadcast_ps((const __m128*) params->sse.half);
+  const __m256 vone = _mm256_broadcast_ps((const __m128*) params->sse.one);
+  const __m256 vzero = _mm256_setzero_ps();
+
+  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+    const __m256 vx${ABC[0:8]} = _mm256_loadu_ps(x);
+    $for N in range(8, BATCH_TILE, 8):
+      const __m256 vx${ABC[N:N+8]} = _mm256_loadu_ps(x + ${N});
+    x += ${BATCH_TILE};
+
+    $if FMA == 3:
+      $for N in range(0, BATCH_TILE, 8):
+        __m256 vacc${ABC[N:N+8]} = _mm256_fmadd_ps(vx${ABC[N:N+8]}, vsixth, vhalf);
+    $else:
+      $for N in range(0, BATCH_TILE, 8):
+        __m256 vacc${ABC[N:N+8]} = _mm256_mul_ps(vx${ABC[N:N+8]}, vsixth);
+
+      $for N in range(0, BATCH_TILE, 8):
+        vacc${ABC[N:N+8]} = _mm256_add_ps(vacc${ABC[N:N+8]}, vhalf);
+
+    $for N in range(0, BATCH_TILE, 8):
+      vacc${ABC[N:N+8]} = _mm256_max_ps(vacc${ABC[N:N+8]}, vzero);
+
+    $for N in range(0, BATCH_TILE, 8):
+      vacc${ABC[N:N+8]} = _mm256_min_ps(vacc${ABC[N:N+8]}, vone);
+
+    $for N in range(0, BATCH_TILE, 8):
+      vacc${ABC[N:N+8]} = _mm256_mul_ps(vacc${ABC[N:N+8]}, vx${ABC[N:N+8]});
+
+    _mm256_storeu_ps(y, vacc${ABC[0:8]});
+    $for N in range(8, BATCH_TILE, 8):
+      _mm256_storeu_ps(y + ${N}, vacc${ABC[N:N+8]});
+    y += ${BATCH_TILE};
+  }
+  $if BATCH_TILE > 8:
+    for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+      const __m256 vx = _mm256_loadu_ps(x);
+      x += 8;
+      $if FMA == 3:
+        __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
+      $else:
+        __m256 vacc = _mm256_mul_ps(vx, vsixth);
+        vacc = _mm256_add_ps(vacc, vhalf);
+      vacc = _mm256_max_ps(vacc, vzero);
+      vacc = _mm256_min_ps(vacc, vone);
+      vacc = _mm256_mul_ps(vacc, vx);
+      _mm256_storeu_ps(y, vacc);
+      y += 8;
+    }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+    const __m256 vx = _mm256_maskload_ps(x, vmask);
+    $if FMA == 3:
+      __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
+    $else:
+      __m256 vacc = _mm256_mul_ps(vx, vsixth);
+      vacc = _mm256_add_ps(vacc, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_maskstore_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/avx512f.c.in b/src/f32-hswish/avx512f.c.in
new file mode 100644
index 0000000..4ff10ca
--- /dev/null
+++ b/src/f32-hswish/avx512f.c.in
@@ -0,0 +1,80 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 16 == 0
+$assert BATCH_TILE >= 16
+$ABC = "456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__avx512f_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m512 vsixth = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.sixth));
+  const __m512 vhalf = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.half));
+  const __m512 vone = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.one));
+  const __m512 vzero = _mm512_setzero_ps();
+
+  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+    const __m512 vx${ABC[0:16]} = _mm512_loadu_ps(x);
+    $for N in range(16, BATCH_TILE, 16):
+      const __m512 vx${ABC[N:N+16]} = _mm512_loadu_ps(x + ${N});
+    x += ${BATCH_TILE};
+
+    $for N in range(0, BATCH_TILE, 16):
+      __m512 vacc${ABC[N:N+16]} = _mm512_fmadd_ps(vx${ABC[N:N+16]}, vsixth, vhalf);
+
+    $for N in range(0, BATCH_TILE, 16):
+      vacc${ABC[N:N+16]} = _mm512_max_ps(vacc${ABC[N:N+16]}, vzero);
+
+    $for N in range(0, BATCH_TILE, 16):
+      vacc${ABC[N:N+16]} = _mm512_min_ps(vacc${ABC[N:N+16]}, vone);
+
+    $for N in range(0, BATCH_TILE, 16):
+      vacc${ABC[N:N+16]} = _mm512_mul_ps(vacc${ABC[N:N+16]}, vx${ABC[N:N+16]});
+
+    _mm512_storeu_ps(y, vacc${ABC[0:16]});
+    $for N in range(16, BATCH_TILE, 16):
+      _mm512_storeu_ps(y + ${N}, vacc${ABC[N:N+16]});
+    y += ${BATCH_TILE};
+  }
+  $if BATCH_TILE > 16:
+    for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+      const __m512 vx = _mm512_loadu_ps(x);
+      x += 16;
+      __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
+      vacc = _mm512_max_ps(vacc, vzero);
+      vacc = _mm512_min_ps(vacc, vone);
+      vacc = _mm512_mul_ps(vacc, vx);
+      _mm512_storeu_ps(y, vacc);
+      y += 16;
+    }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 15 * sizeof(float));
+    // Prepare mask for valid 32-bit elements (depends on n).
+    n >>= 2 /* log2(sizeof(float)) */;
+    const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
+
+    const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
+    __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm512_max_ps(vacc, vzero);
+    vacc = _mm512_min_ps(vacc, vone);
+    vacc = _mm512_mul_ps(vacc, vx);
+    _mm512_mask_storeu_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/avx-x16.c b/src/f32-hswish/gen/avx-x16.c
new file mode 100644
index 0000000..d4d3cfb
--- /dev/null
+++ b/src/f32-hswish/gen/avx-x16.c
+@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
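+// Table of 7 all-ones entries followed by 7 zeros; the remainder path loads 8 consecutive
+// entries starting n bytes before &mask_table[7] to get an all-ones lane per leftover float.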
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+void xnn_f32_hswish_ukernel__avx_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m256 vsixth = _mm256_broadcast_ps((const __m128*) params->sse.sixth);
+  const __m256 vhalf = _mm256_broadcast_ps((const __m128*) params->sse.half);
+  const __m256 vone = _mm256_broadcast_ps((const __m128*) params->sse.one);
+  const __m256 vzero = _mm256_setzero_ps();
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const __m256 vx01234567 = _mm256_loadu_ps(x);
+    const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    x += 16;
+
+    __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vsixth);
+    __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vsixth);
+
+    vacc01234567 = _mm256_add_ps(vacc01234567, vhalf);
+    vacc89ABCDEF = _mm256_add_ps(vacc89ABCDEF, vhalf);
+
+    vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
+    vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
+
+    vacc01234567 = _mm256_min_ps(vacc01234567, vone);
+    vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vone);
+
+    vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
+    vacc89ABCDEF = _mm256_mul_ps(vacc89ABCDEF, vx89ABCDEF);
+
+    _mm256_storeu_ps(y, vacc01234567);
+    _mm256_storeu_ps(y + 8, vacc89ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const __m256 vx = _mm256_loadu_ps(x);
+    x += 8;
+    __m256 vacc = _mm256_mul_ps(vx, vsixth);
+    vacc = _mm256_add_ps(vacc, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_storeu_ps(y, vacc);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+    const __m256 vx = _mm256_maskload_ps(x, vmask);
+    __m256 vacc = _mm256_mul_ps(vx, vsixth);
+    vacc = _mm256_add_ps(vacc, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_maskstore_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/avx-x8.c b/src/f32-hswish/gen/avx-x8.c
new file mode 100644
index 0000000..d0f6682
--- /dev/null
+++ b/src/f32-hswish/gen/avx-x8.c
+@@ -0,0 +1,66 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
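+// Table of 7 all-ones entries followed by 7 zeros; the remainder path loads 8 consecutive
+// entries starting n bytes before &mask_table[7] to get an all-ones lane per leftover float.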
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+void xnn_f32_hswish_ukernel__avx_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m256 vsixth = _mm256_broadcast_ps((const __m128*) params->sse.sixth);
+  const __m256 vhalf = _mm256_broadcast_ps((const __m128*) params->sse.half);
+  const __m256 vone = _mm256_broadcast_ps((const __m128*) params->sse.one);
+  const __m256 vzero = _mm256_setzero_ps();
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const __m256 vx01234567 = _mm256_loadu_ps(x);
+    x += 8;
+
+    __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vsixth);
+
+    vacc01234567 = _mm256_add_ps(vacc01234567, vhalf);
+
+    vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
+
+    vacc01234567 = _mm256_min_ps(vacc01234567, vone);
+
+    vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
+
+    _mm256_storeu_ps(y, vacc01234567);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+    const __m256 vx = _mm256_maskload_ps(x, vmask);
+    __m256 vacc = _mm256_mul_ps(vx, vsixth);
+    vacc = _mm256_add_ps(vacc, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_maskstore_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/avx512f-x16.c b/src/f32-hswish/gen/avx512f-x16.c
new file mode 100644
index 0000000..d98ab02
--- /dev/null
+++ b/src/f32-hswish/gen/avx512f-x16.c
+@@ -0,0 +1,62 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/avx512f.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__avx512f_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m512 vsixth = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.sixth));
+  const __m512 vhalf = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.half));
+  const __m512 vone = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.one));
+  const __m512 vzero = _mm512_setzero_ps();
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const __m512 vx0123456789ABCDEF = _mm512_loadu_ps(x);
+    x += 16;
+
+    __m512 vacc0123456789ABCDEF = _mm512_fmadd_ps(vx0123456789ABCDEF, vsixth, vhalf);
+
+    vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEF, vzero);
+
+    vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vone);
+
+    vacc0123456789ABCDEF = _mm512_mul_ps(vacc0123456789ABCDEF, vx0123456789ABCDEF);
+
+    _mm512_storeu_ps(y, vacc0123456789ABCDEF);
+    y += 16;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 15 * sizeof(float));
+    // Prepare mask for valid 32-bit elements (depends on n).
+    n >>= 2 /* log2(sizeof(float)) */;
+    const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
+
+    const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
+    __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm512_max_ps(vacc, vzero);
+    vacc = _mm512_min_ps(vacc, vone);
+    vacc = _mm512_mul_ps(vacc, vx);
+    _mm512_mask_storeu_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/avx512f-x32.c b/src/f32-hswish/gen/avx512f-x32.c
new file mode 100644
index 0000000..438a3ee
--- /dev/null
+++ b/src/f32-hswish/gen/avx512f-x32.c
@@ -0,0 +1,78 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/avx512f.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__avx512f_x32(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m512 vsixth = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.sixth));
+  const __m512 vhalf = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.half));
+  const __m512 vone = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.one));
+  const __m512 vzero = _mm512_setzero_ps();
+
+  for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
+    const __m512 vx0123456789ABCDEF = _mm512_loadu_ps(x);
+    const __m512 vxGHIJKLMNOPQRSTUV = _mm512_loadu_ps(x + 16);
+    x += 32;
+
+    __m512 vacc0123456789ABCDEF = _mm512_fmadd_ps(vx0123456789ABCDEF, vsixth, vhalf);
+    __m512 vaccGHIJKLMNOPQRSTUV = _mm512_fmadd_ps(vxGHIJKLMNOPQRSTUV, vsixth, vhalf);
+
+    vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEF, vzero);
+    vaccGHIJKLMNOPQRSTUV = _mm512_max_ps(vaccGHIJKLMNOPQRSTUV, vzero);
+
+    vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vone);
+    vaccGHIJKLMNOPQRSTUV = _mm512_min_ps(vaccGHIJKLMNOPQRSTUV, vone);
+
+    vacc0123456789ABCDEF = _mm512_mul_ps(vacc0123456789ABCDEF, vx0123456789ABCDEF);
+    vaccGHIJKLMNOPQRSTUV = _mm512_mul_ps(vaccGHIJKLMNOPQRSTUV, vxGHIJKLMNOPQRSTUV);
+
+    _mm512_storeu_ps(y, vacc0123456789ABCDEF);
+    _mm512_storeu_ps(y + 16, vaccGHIJKLMNOPQRSTUV);
+    y += 32;
+  }
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const __m512 vx = _mm512_loadu_ps(x);
+    x += 16;
+    __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm512_max_ps(vacc, vzero);
+    vacc = _mm512_min_ps(vacc, vone);
+    vacc = _mm512_mul_ps(vacc, vx);
+    _mm512_storeu_ps(y, vacc);
+    y += 16;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 15 * sizeof(float));
+    // Prepare mask for valid 32-bit elements (depends on n).
+    n >>= 2 /* log2(sizeof(float)) */;
+    const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << n) - UINT32_C(1)));
+
+    const __m512 vx = _mm512_maskz_loadu_ps(vmask, x);
+    __m512 vacc = _mm512_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm512_max_ps(vacc, vzero);
+    vacc = _mm512_min_ps(vacc, vone);
+    vacc = _mm512_mul_ps(vacc, vx);
+    _mm512_mask_storeu_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/fma3-x16.c b/src/f32-hswish/gen/fma3-x16.c
new file mode 100644
index 0000000..3c17c74
--- /dev/null
+++ b/src/f32-hswish/gen/fma3-x16.c
+@@ -0,0 +1,79 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
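+// Table of 7 all-ones entries followed by 7 zeros; the remainder path loads 8 consecutive
+// entries starting n bytes before &mask_table[7] to get an all-ones lane per leftover float.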
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+void xnn_f32_hswish_ukernel__fma3_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m256 vsixth = _mm256_broadcast_ps((const __m128*) params->sse.sixth);
+  const __m256 vhalf = _mm256_broadcast_ps((const __m128*) params->sse.half);
+  const __m256 vone = _mm256_broadcast_ps((const __m128*) params->sse.one);
+  const __m256 vzero = _mm256_setzero_ps();
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const __m256 vx01234567 = _mm256_loadu_ps(x);
+    const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
+    x += 16;
+
+    __m256 vacc01234567 = _mm256_fmadd_ps(vx01234567, vsixth, vhalf);
+    __m256 vacc89ABCDEF = _mm256_fmadd_ps(vx89ABCDEF, vsixth, vhalf);
+
+    vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
+    vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
+
+    vacc01234567 = _mm256_min_ps(vacc01234567, vone);
+    vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vone);
+
+    vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
+    vacc89ABCDEF = _mm256_mul_ps(vacc89ABCDEF, vx89ABCDEF);
+
+    _mm256_storeu_ps(y, vacc01234567);
+    _mm256_storeu_ps(y + 8, vacc89ABCDEF);
+    y += 16;
+  }
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const __m256 vx = _mm256_loadu_ps(x);
+    x += 8;
+    __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_storeu_ps(y, vacc);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+    const __m256 vx = _mm256_maskload_ps(x, vmask);
+    __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_maskstore_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/fma3-x8.c b/src/f32-hswish/gen/fma3-x8.c
new file mode 100644
index 0000000..576662e
--- /dev/null
+++ b/src/f32-hswish/gen/fma3-x8.c
+@@ -0,0 +1,62 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/avx.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
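+// Table of 7 all-ones entries followed by 7 zeros; the remainder path loads 8 consecutive
+// entries starting n bytes before &mask_table[7] to get an all-ones lane per leftover float.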
+static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
+
+void xnn_f32_hswish_ukernel__fma3_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m256 vsixth = _mm256_broadcast_ps((const __m128*) params->sse.sixth);
+  const __m256 vhalf = _mm256_broadcast_ps((const __m128*) params->sse.half);
+  const __m256 vone = _mm256_broadcast_ps((const __m128*) params->sse.one);
+  const __m256 vzero = _mm256_setzero_ps();
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const __m256 vx01234567 = _mm256_loadu_ps(x);
+    x += 8;
+
+    __m256 vacc01234567 = _mm256_fmadd_ps(vx01234567, vsixth, vhalf);
+
+    vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
+
+    vacc01234567 = _mm256_min_ps(vacc01234567, vone);
+
+    vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
+
+    _mm256_storeu_ps(y, vacc01234567);
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    assert(n >= 1 * sizeof(float));
+    assert(n <= 7 * sizeof(float));
+    __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - n));
+
+    const __m256 vx = _mm256_maskload_ps(x, vmask);
+    __m256 vacc = _mm256_fmadd_ps(vx, vsixth, vhalf);
+    vacc = _mm256_max_ps(vacc, vzero);
+    vacc = _mm256_min_ps(vacc, vone);
+    vacc = _mm256_mul_ps(vacc, vx);
+    _mm256_maskstore_ps(y, vmask, vacc);
+  }
+}
diff --git a/src/f32-hswish/gen/neon-x4.c b/src/f32-hswish/gen/neon-x4.c
new file mode 100644
index 0000000..572a8ac
--- /dev/null
+++ b/src/f32-hswish/gen/neon-x4.c
+@@ -0,0 +1,62 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__neon_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
+  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
+  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+
+    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+
+    vacc0123 = vminq_f32(vacc0123, vone);
+
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
+    vst1q_f32(y, vacc0123); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx0123 = vld1q_f32(x);
+    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
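+    // Store the 1-3 leftover values: a pair of lanes first if at least two remain, then a single lane.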
+    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vacc01); y += 2;
+      vacc01 = vget_high_f32(vacc0123);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vacc01, 0);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/neon-x8.c b/src/f32-hswish/gen/neon-x8.c
new file mode 100644
index 0000000..c13d648
--- /dev/null
+++ b/src/f32-hswish/gen/neon-x8.c
+@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__neon_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
+  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
+  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+    float32x4_t vacc4567 = vmlaq_f32(vhalf, vx4567, vsixth);
+
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc4567 = vmaxq_f32(vacc4567, vzero);
+
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc4567 = vminq_f32(vacc4567, vone);
+
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    vacc4567 = vmulq_f32(vacc4567, vx4567);
+
+    vst1q_f32(y, vacc0123); y += 4;
+    vst1q_f32(y, vacc4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    vst1q_f32(y, vacc0123); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx0123 = vld1q_f32(x);
+    float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
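+    // Store the 1-3 leftover values: a pair of lanes first if at least two remain, then a single lane.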
+    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vacc01); y += 2;
+      vacc01 = vget_high_f32(vacc0123);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vacc01, 0);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/neonfma-x4.c b/src/f32-hswish/gen/neonfma-x4.c
new file mode 100644
index 0000000..f570133
--- /dev/null
+++ b/src/f32-hswish/gen/neonfma-x4.c
+@@ -0,0 +1,62 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__neonfma_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
+  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
+  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+
+    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+
+    vacc0123 = vminq_f32(vacc0123, vone);
+
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
+    vst1q_f32(y, vacc0123); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx0123 = vld1q_f32(x);
+    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
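+    // Store the 1-3 leftover values: a pair of lanes first if at least two remain, then a single lane.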
+    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vacc01); y += 2;
+      vacc01 = vget_high_f32(vacc0123);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vacc01, 0);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/neonfma-x8.c b/src/f32-hswish/gen/neonfma-x8.c
new file mode 100644
index 0000000..9d45646
--- /dev/null
+++ b/src/f32-hswish/gen/neonfma-x8.c
+@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__neonfma_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
+  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
+  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+    float32x4_t vacc4567 = vfmaq_f32(vhalf, vx4567, vsixth);
+
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc4567 = vmaxq_f32(vacc4567, vzero);
+
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc4567 = vminq_f32(vacc4567, vone);
+
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    vacc4567 = vmulq_f32(vacc4567, vx4567);
+
+    vst1q_f32(y, vacc0123); y += 4;
+    vst1q_f32(y, vacc4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+    vst1q_f32(y, vacc0123); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx0123 = vld1q_f32(x);
+    float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
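+    // Store the 1-3 leftover values: a pair of lanes first if at least two remain, then a single lane.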
+    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vacc01); y += 2;
+      vacc01 = vget_high_f32(vacc0123);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vacc01, 0);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/psimd-x4.c b/src/f32-hswish/gen/psimd-x4.c
new file mode 100644
index 0000000..6e1930d
--- /dev/null
+++ b/src/f32-hswish/gen/psimd-x4.c
+@@ -0,0 +1,64 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__psimd_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
+  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
+  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
+  const psimd_f32 vzero = psimd_splat_f32(0.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    x += 4;
+
+    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+
+    vacc0123 = psimd_max_f32(vacc0123, vzero);
+
+    vacc0123 = psimd_min_f32(vacc0123, vone);
+
+    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+
+    psimd_store_f32(y, vacc0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+    vacc0123 = psimd_max_f32(vacc0123, vzero);
+    vacc0123 = psimd_min_f32(vacc0123, vone);
+    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+
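+    // Store the 1-3 leftover values: the low pair first, then shift the high half down for the final single-element store.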
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vacc0123);
+      vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vacc0123);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/psimd-x8.c b/src/f32-hswish/gen/psimd-x8.c
new file mode 100644
index 0000000..621228c
--- /dev/null
+++ b/src/f32-hswish/gen/psimd-x8.c
+@@ -0,0 +1,80 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__psimd_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
+  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
+  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
+  const psimd_f32 vzero = psimd_splat_f32(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
+    x += 8;
+
+    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+    psimd_f32 vacc4567 = psimd_qfma_f32(vhalf, vx4567, vsixth);
+
+    vacc0123 = psimd_max_f32(vacc0123, vzero);
+    vacc4567 = psimd_max_f32(vacc4567, vzero);
+
+    vacc0123 = psimd_min_f32(vacc0123, vone);
+    vacc4567 = psimd_min_f32(vacc4567, vone);
+
+    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+    vacc4567 = psimd_mul_f32(vacc4567, vx4567);
+
+    psimd_store_f32(y, vacc0123);
+    psimd_store_f32(y + 4, vacc4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    x += 4;
+    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+    vacc0123 = psimd_max_f32(vacc0123, vzero);
+    vacc0123 = psimd_min_f32(vacc0123, vone);
+    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+    psimd_store_f32(y, vacc0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+    vacc0123 = psimd_max_f32(vacc0123, vzero);
+    vacc0123 = psimd_min_f32(vacc0123, vone);
+    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+
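+    // Store the 1-3 leftover values: the low pair first, then shift the high half down for the final single-element store.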
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vacc0123);
+      vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vacc0123);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/scalar-x1.c b/src/f32-hswish/gen/scalar-x1.c
new file mode 100644
index 0000000..c6d80b9
--- /dev/null
+++ b/src/f32-hswish/gen/scalar-x1.c
@@ -0,0 +1,40 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__scalar_x1(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float vx = *x++;
+    float vacc = vx * vsixth + vhalf;
+    vacc = math_max_f32(vacc, 0.0f);
+    vacc = math_min_f32(vacc, vone);
+    vacc = vacc * vx;
+    *y++ = vacc;
+  }
+}
diff --git a/src/f32-hswish/gen/scalar-x2.c b/src/f32-hswish/gen/scalar-x2.c
new file mode 100644
index 0000000..6f8d58e
--- /dev/null
+++ b/src/f32-hswish/gen/scalar-x2.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__scalar_x2(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float vx0 = x[0];
+    const float vx1 = x[1];
+    x += 2;
+
+    float vacc0 = vx0 * vsixth + vhalf;
+    float vacc1 = vx1 * vsixth + vhalf;
+
+    vacc0 = math_max_f32(vacc0, 0.0f);
+    vacc1 = math_max_f32(vacc1, 0.0f);
+
+    vacc0 = math_min_f32(vacc0, vone);
+    vacc1 = math_min_f32(vacc1, vone);
+
+    vacc0 *= vx0;
+    vacc1 *= vx1;
+
+    y[0] = vacc0;
+    y[1] = vacc1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float vx = *x;
+    float vacc = vx * vsixth + vhalf;
+    vacc = math_max_f32(vacc, 0.0f);
+    vacc = math_min_f32(vacc, vone);
+    vacc = vacc * vx;
+    *y = vacc;
+  }
+}
diff --git a/src/f32-hswish/gen/scalar-x4.c b/src/f32-hswish/gen/scalar-x4.c
new file mode 100644
index 0000000..e9fdff0
--- /dev/null
+++ b/src/f32-hswish/gen/scalar-x4.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__scalar_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float vx0 = x[0];
+    const float vx1 = x[1];
+    const float vx2 = x[2];
+    const float vx3 = x[3];
+    x += 4;
+
+    float vacc0 = vx0 * vsixth + vhalf;
+    float vacc1 = vx1 * vsixth + vhalf;
+    float vacc2 = vx2 * vsixth + vhalf;
+    float vacc3 = vx3 * vsixth + vhalf;
+
+    vacc0 = math_max_f32(vacc0, 0.0f);
+    vacc1 = math_max_f32(vacc1, 0.0f);
+    vacc2 = math_max_f32(vacc2, 0.0f);
+    vacc3 = math_max_f32(vacc3, 0.0f);
+
+    vacc0 = math_min_f32(vacc0, vone);
+    vacc1 = math_min_f32(vacc1, vone);
+    vacc2 = math_min_f32(vacc2, vone);
+    vacc3 = math_min_f32(vacc3, vone);
+
+    vacc0 *= vx0;
+    vacc1 *= vx1;
+    vacc2 *= vx2;
+    vacc3 *= vx3;
+
+    y[0] = vacc0;
+    y[1] = vacc1;
+    y[2] = vacc2;
+    y[3] = vacc3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float vx = *x++;
+      float vacc = vx * vsixth + vhalf;
+      vacc = math_max_f32(vacc, 0.0f);
+      vacc = math_min_f32(vacc, vone);
+      vacc = vacc * vx;
+      *y++ = vacc;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-hswish/gen/sse-x4.c b/src/f32-hswish/gen/sse-x4.c
new file mode 100644
index 0000000..25d8c27
--- /dev/null
+++ b/src/f32-hswish/gen/sse-x4.c
+@@ -0,0 +1,67 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__sse_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vsixth = _mm_load_ps(params->sse.sixth);
+  const __m128 vhalf = _mm_load_ps(params->sse.half);
+  const __m128 vone = _mm_load_ps(params->sse.one);
+  const __m128 vzero = _mm_setzero_ps();
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    x += 4;
+
+    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+
+    vacc0123 = _mm_add_ps(vacc0123, vhalf);
+
+    vacc0123 = _mm_max_ps(vacc0123, vzero);
+
+    vacc0123 = _mm_min_ps(vacc0123, vone);
+
+    vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+
+    _mm_storeu_ps(y, vacc0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+    vacc0123 = _mm_add_ps(vacc0123, vhalf);
+    vacc0123 = _mm_max_ps(vacc0123, vzero);
+    vacc0123 = _mm_min_ps(vacc0123, vone);
+    vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+
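+    // Store the 1-3 leftover values: the low pair first, then move the high half down for the final single-element store.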
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vacc0123);
+      vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vacc0123);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/sse-x8.c b/src/f32-hswish/gen/sse-x8.c
new file mode 100644
index 0000000..bd0b354
--- /dev/null
+++ b/src/f32-hswish/gen/sse-x8.c
+@@ -0,0 +1,85 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__sse_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vsixth = _mm_load_ps(params->sse.sixth);
+  const __m128 vhalf = _mm_load_ps(params->sse.half);
+  const __m128 vone = _mm_load_ps(params->sse.one);
+  const __m128 vzero = _mm_setzero_ps();
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    x += 8;
+
+    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+    __m128 vacc4567 = _mm_mul_ps(vx4567, vsixth);
+
+    vacc0123 = _mm_add_ps(vacc0123, vhalf);
+    vacc4567 = _mm_add_ps(vacc4567, vhalf);
+
+    vacc0123 = _mm_max_ps(vacc0123, vzero);
+    vacc4567 = _mm_max_ps(vacc4567, vzero);
+
+    vacc0123 = _mm_min_ps(vacc0123, vone);
+    vacc4567 = _mm_min_ps(vacc4567, vone);
+
+    vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+    vacc4567 = _mm_mul_ps(vacc4567, vx4567);
+
+    _mm_storeu_ps(y, vacc0123);
+    _mm_storeu_ps(y + 4, vacc4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    x += 4;
+    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+    vacc0123 = _mm_add_ps(vacc0123, vhalf);
+    vacc0123 = _mm_max_ps(vacc0123, vzero);
+    vacc0123 = _mm_min_ps(vacc0123, vone);
+    vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+    _mm_storeu_ps(y, vacc0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+    vacc0123 = _mm_add_ps(vacc0123, vhalf);
+    vacc0123 = _mm_max_ps(vacc0123, vzero);
+    vacc0123 = _mm_min_ps(vacc0123, vone);
+    vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+
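+    // Store the 1-3 leftover values: the low pair first, then move the high half down for the final single-element store.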
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vacc0123);
+      vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vacc0123);
+    }
+  }
+}
diff --git a/src/f32-hswish/gen/wasm-x1.c b/src/f32-hswish/gen/wasm-x1.c
new file mode 100644
index 0000000..0914018
--- /dev/null
+++ b/src/f32-hswish/gen/wasm-x1.c
@@ -0,0 +1,40 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasm_x1(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float vx = *x++;
+    float vacc = vx * vsixth + vhalf;
+    vacc = __builtin_wasm_max_f32(vacc, 0.0f);
+    vacc = __builtin_wasm_min_f32(vacc, vone);
+    vacc = vacc * vx;
+    *y++ = vacc;
+  }
+}
diff --git a/src/f32-hswish/gen/wasm-x2.c b/src/f32-hswish/gen/wasm-x2.c
new file mode 100644
index 0000000..bbb188d
--- /dev/null
+++ b/src/f32-hswish/gen/wasm-x2.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasm_x2(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float vx0 = x[0];
+    const float vx1 = x[1];
+    x += 2;
+
+    float vacc0 = vx0 * vsixth + vhalf;
+    float vacc1 = vx1 * vsixth + vhalf;
+
+    vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
+    vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);
+
+    vacc0 = __builtin_wasm_min_f32(vacc0, vone);
+    vacc1 = __builtin_wasm_min_f32(vacc1, vone);
+
+    vacc0 *= vx0;
+    vacc1 *= vx1;
+
+    y[0] = vacc0;
+    y[1] = vacc1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float vx = *x;
+    float vacc = vx * vsixth + vhalf;
+    vacc = __builtin_wasm_max_f32(vacc, 0.0f);
+    vacc = __builtin_wasm_min_f32(vacc, vone);
+    vacc = vacc * vx;
+    *y = vacc;
+  }
+}
diff --git a/src/f32-hswish/gen/wasm-x4.c b/src/f32-hswish/gen/wasm-x4.c
new file mode 100644
index 0000000..483688f
--- /dev/null
+++ b/src/f32-hswish/gen/wasm-x4.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-hswish/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasm_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float vx0 = x[0];
+    const float vx1 = x[1];
+    const float vx2 = x[2];
+    const float vx3 = x[3];
+    x += 4;
+
+    float vacc0 = vx0 * vsixth + vhalf;
+    float vacc1 = vx1 * vsixth + vhalf;
+    float vacc2 = vx2 * vsixth + vhalf;
+    float vacc3 = vx3 * vsixth + vhalf;
+
+    vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
+    vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);
+    vacc2 = __builtin_wasm_max_f32(vacc2, 0.0f);
+    vacc3 = __builtin_wasm_max_f32(vacc3, 0.0f);
+
+    vacc0 = __builtin_wasm_min_f32(vacc0, vone);
+    vacc1 = __builtin_wasm_min_f32(vacc1, vone);
+    vacc2 = __builtin_wasm_min_f32(vacc2, vone);
+    vacc3 = __builtin_wasm_min_f32(vacc3, vone);
+
+    vacc0 *= vx0;
+    vacc1 *= vx1;
+    vacc2 *= vx2;
+    vacc3 *= vx3;
+
+    y[0] = vacc0;
+    y[1] = vacc1;
+    y[2] = vacc2;
+    y[3] = vacc3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float vx = *x++;
+      float vacc = vx * vsixth + vhalf;
+      vacc = __builtin_wasm_max_f32(vacc, 0.0f);
+      vacc = __builtin_wasm_min_f32(vacc, vone);
+      vacc = vacc * vx;
+      *y++ = vacc;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-hswish/neon.c b/src/f32-hswish/neon.c
deleted file mode 100644
index 309c15b..0000000
--- a/src/f32-hswish/neon.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__neon(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
-
-  for (; n >= 16; n -= 16) {
-    const float32x4_t vx = vld1q_f32(x); x += 4;
-
-    const float32x4_t vt = vminq_f32(vmaxq_f32(vmlaq_f32(vhalf, vx, vsixth), vzero), vone);
-    const float32x4_t vy = vmulq_f32(vt, vx);
-
-    vst1q_f32(y, vy); y += 4;
-  }
-  if (n != 0) {
-    const float32x4_t vx = vld1q_f32(x); x += 4;
-
-    const float32x4_t vt = vminq_f32(vmaxq_f32(vmlaq_f32(vhalf, vx, vsixth), vzero), vone);
-    const float32x4_t vy = vmulq_f32(vt, vx);
-
-    float32x2_t vy_lo = vget_low_f32(vy);
-    if (n & 8) {
-      vst1_f32(y, vy_lo); y += 2;
-      vy_lo = vget_high_f32(vy);
-    }
-    if (n & 4) {
-      vst1_lane_f32(y, vy_lo, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/neon.c.in b/src/f32-hswish/neon.c.in
new file mode 100644
index 0000000..0388dbc
--- /dev/null
+++ b/src/f32-hswish/neon.c.in
+@@ -0,0 +1,85 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__${"neonfma" if FMA else "neon"}_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
+  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
+  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
+  const float32x4_t vzero = vdupq_n_f32(0.0f);
+
+  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+    $for N in range(0, BATCH_TILE, 4):
+      const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
+
+    $for N in range(0, BATCH_TILE, 4):
+      $if FMA:
+        float32x4_t vacc${ABC[N:N+4]} = vfmaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
+      $else:
+        float32x4_t vacc${ABC[N:N+4]} = vmlaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = vmaxq_f32(vacc${ABC[N:N+4]}, vzero);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = vminq_f32(vacc${ABC[N:N+4]}, vone);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = vmulq_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+
+    $for N in range(0, BATCH_TILE, 4):
+      vst1q_f32(y, vacc${ABC[N:N+4]}); y += 4;
+  }
+  $if BATCH_TILE > 4:
+    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+      const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+      $if FMA:
+        float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+      $else:
+        float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+      vacc0123 = vmaxq_f32(vacc0123, vzero);
+      vacc0123 = vminq_f32(vacc0123, vone);
+      vacc0123 = vmulq_f32(vacc0123, vx0123);
+      vst1q_f32(y, vacc0123); y += 4;
+    }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx0123 = vld1q_f32(x);
+    $if FMA:
+      float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
+    $else:
+      float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
+    vacc0123 = vmaxq_f32(vacc0123, vzero);
+    vacc0123 = vminq_f32(vacc0123, vone);
+    vacc0123 = vmulq_f32(vacc0123, vx0123);
+
+    float32x2_t vacc01 = vget_low_f32(vacc0123);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vacc01); y += 2;
+      vacc01 = vget_high_f32(vacc0123);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vacc01, 0);
+    }
+  }
+}
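
To make the unrolling concrete: with BATCH_TILE=8 and FMA=0, the template above expands to roughly the following main loop for xnn_f32_hswish_ukernel__neon_x8. This is a sketch of the generated code, with the params loads replaced by plain splats; each stage is grouped across both vectors to expose instruction-level parallelism:

    #include <stddef.h>
    #include <arm_neon.h>

    // Sketch of the BATCH_TILE=8, FMA=0 expansion of neon.c.in above.
    void hswish_neon_x8_sketch(size_t n, const float* x, float* y,
                               float sixth, float half, float one) {
      const float32x4_t vsixth = vdupq_n_f32(sixth);
      const float32x4_t vhalf = vdupq_n_f32(half);
      const float32x4_t vone = vdupq_n_f32(one);
      const float32x4_t vzero = vdupq_n_f32(0.0f);
      for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
        const float32x4_t vx0123 = vld1q_f32(x); x += 4;
        const float32x4_t vx4567 = vld1q_f32(x); x += 4;

        float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
        float32x4_t vacc4567 = vmlaq_f32(vhalf, vx4567, vsixth);

        vacc0123 = vmaxq_f32(vacc0123, vzero);
        vacc4567 = vmaxq_f32(vacc4567, vzero);

        vacc0123 = vminq_f32(vacc0123, vone);
        vacc4567 = vminq_f32(vacc4567, vone);

        vacc0123 = vmulq_f32(vacc0123, vx0123);
        vacc4567 = vmulq_f32(vacc4567, vx4567);

        vst1q_f32(y, vacc0123); y += 4;
        vst1q_f32(y, vacc4567); y += 4;
      }
      // The 4-wide loop and the 1..3-element tail are omitted here;
      // see the template above.
    }
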
diff --git a/src/f32-hswish/neonfma.c b/src/f32-hswish/neonfma.c
deleted file mode 100644
index 3e76bd9..0000000
--- a/src/f32-hswish/neonfma.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__neonfma(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
-  const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
-  const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
-
-  for (; n >= 16; n -= 16) {
-    const float32x4_t vx = vld1q_f32(x); x += 4;
-
-    const float32x4_t vt = vminq_f32(vmaxq_f32(vfmaq_f32(vhalf, vx, vsixth), vzero), vone);
-    const float32x4_t vy = vmulq_f32(vt, vx);
-
-    vst1q_f32(y, vy); y += 4;
-  }
-  if (n != 0) {
-    const float32x4_t vx = vld1q_f32(x); x += 4;
-
-    const float32x4_t vt = vminq_f32(vmaxq_f32(vfmaq_f32(vhalf, vx, vsixth), vzero), vone);
-    const float32x4_t vy = vmulq_f32(vt, vx);
-
-    float32x2_t vy_lo = vget_low_f32(vy);
-    if (n & 8) {
-      vst1_f32(y, vy_lo); y += 2;
-      vy_lo = vget_high_f32(vy);
-    }
-    if (n & 4) {
-      vst1_lane_f32(y, vy_lo, 0);
-    }
-  }
-}
diff --git a/src/f32-hswish/psimd.c b/src/f32-hswish/psimd.c
deleted file mode 100644
index b285d34..0000000
--- a/src/f32-hswish/psimd.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
-  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
-  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
-  const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
-  for (; n >= 16; n -= 16) {
-    const psimd_f32 vx = psimd_load_f32(x);
-    x += 4;
-
-    const psimd_f32 vt = psimd_min_f32(psimd_max_f32(psimd_add_f32(psimd_mul_f32(vx, vsixth), vhalf), vzero), vone);
-    const psimd_f32 vy = psimd_mul_f32(vt, vx);
-
-    psimd_store_f32(y, vy);
-    y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const psimd_f32 vx = psimd_load_f32(x);
-    x += 4;
-
-    const psimd_f32 vt = psimd_min_f32(psimd_max_f32(psimd_add_f32(psimd_mul_f32(vx, vsixth), vhalf), vzero), vone);
-    psimd_f32 vy = psimd_mul_f32(vt, vx);
-
-    if (n & 8) {
-      psimd_store2_f32(y, vy);
-      vy = psimd_concat_hi_f32(vy, vy);
-      y += 2;
-    }
-    if (n & 4) {
-      psimd_store1_f32(y, vy);
-    }
-  }
-}
diff --git a/src/f32-hswish/psimd.c.in b/src/f32-hswish/psimd.c.in
new file mode 100644
index 0000000..c20089b
--- /dev/null
+++ b/src/f32-hswish/psimd.c.in
@@ -0,0 +1,81 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_hswish_ukernel__psimd_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
+  const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
+  const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
+  const psimd_f32 vzero = psimd_splat_f32(0.0f);
+
+  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+    const psimd_f32 vx${ABC[0:4]} = psimd_load_f32(x);
+    $for N in range(4, BATCH_TILE, 4):
+      const psimd_f32 vx${ABC[N:N+4]} = psimd_load_f32(x + ${N});
+    x += ${BATCH_TILE};
+
+    $for N in range(0, BATCH_TILE, 4):
+      psimd_f32 vacc${ABC[N:N+4]} = psimd_qfma_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = psimd_max_f32(vacc${ABC[N:N+4]}, vzero);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = psimd_min_f32(vacc${ABC[N:N+4]}, vone);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = psimd_mul_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+
+    psimd_store_f32(y, vacc${ABC[0:4]});
+    $for N in range(4, BATCH_TILE, 4):
+      psimd_store_f32(y + ${N}, vacc${ABC[N:N+4]});
+    y += ${BATCH_TILE};
+  }
+  $if BATCH_TILE > 4:
+    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+      const psimd_f32 vx0123 = psimd_load_f32(x);
+      x += 4;
+      psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+      vacc0123 = psimd_max_f32(vacc0123, vzero);
+      vacc0123 = psimd_min_f32(vacc0123, vone);
+      vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+      psimd_store_f32(y, vacc0123);
+      y += 4;
+    }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
+    vacc0123 = psimd_max_f32(vacc0123, vzero);
+    vacc0123 = psimd_min_f32(vacc0123, vone);
+    vacc0123 = psimd_mul_f32(vacc0123, vx0123);
+
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vacc0123);
+      vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vacc0123);
+    }
+  }
+}
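
psimd_qfma_f32(a, b, c) is psimd's "quasi-FMA": it computes a + b*c, fused where the target supports it and as a separate multiply and add otherwise. Per element, the template's four stages amount to the following (a hedged scalar model for reference, not part of the commit):

    #include <math.h>

    // Hedged scalar model of one lane of the kernel above:
    // hswish(x) = x * min(max(x/6 + 1/2, 0), 1).
    static inline float hswish_lane(float x, float sixth, float half, float one) {
      float acc = half + x * sixth;  // psimd_qfma_f32(vhalf, vx, vsixth)
      acc = fmaxf(acc, 0.0f);        // psimd_max_f32(vacc, vzero)
      acc = fminf(acc, one);         // psimd_min_f32(vacc, vone)
      return acc * x;                // psimd_mul_f32(vacc, vx)
    }
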
diff --git a/src/f32-hswish/scalar.c b/src/f32-hswish/scalar.c
deleted file mode 100644
index ace4fca..0000000
--- a/src/f32-hswish/scalar.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/hswish.h>
-#include <xnnpack/math.h>
-
-
-void xnn_f32_hswish_ukernel__scalar(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
-
-  do {
-    const float vx = *x++;
-
-    const float vt = math_min_f32(math_max_f32(vx * vsixth + vhalf, 0.0f), vone);
-    const float vy = vt * vx;
-
-    *y++ = vy;
-    n -= 4;
-  } while (n != 0);
-}
diff --git a/src/f32-hswish/scalar.c.in b/src/f32-hswish/scalar.c.in
new file mode 100644
index 0000000..d029eba
--- /dev/null
+++ b/src/f32-hswish/scalar.c.in
@@ -0,0 +1,82 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE >= 1
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_f32_hswish_ukernel__${"wasm" if WASM else "scalar"}_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  $if BATCH_TILE > 1:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      $for N in range(BATCH_TILE):
+        const float vx${ABC[N]} = x[${N}];
+      x += ${BATCH_TILE};
+
+      $for N in range(BATCH_TILE):
+        float vacc${ABC[N]} = vx${ABC[N]} * vsixth + vhalf;
+
+      $for N in range(BATCH_TILE):
+        vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, 0.0f);
+
+      $for N in range(BATCH_TILE):
+        vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vone);
+
+      $for N in range(BATCH_TILE):
+        vacc${ABC[N]} *= vx${ABC[N]};
+
+      $for N in range(BATCH_TILE):
+        y[${N}] = vacc${ABC[N]};
+      y += ${BATCH_TILE};
+    }
+    if XNN_UNLIKELY(n != 0) {
+      $if BATCH_TILE > 2:
+        do {
+          const float vx = *x++;
+          float vacc = vx * vsixth + vhalf;
+          vacc = ${MAX_F32}(vacc, 0.0f);
+          vacc = ${MIN_F32}(vacc, vone);
+          vacc = vacc * vx;
+          *y++ = vacc;
+          n -= sizeof(float);
+        } while (n != 0);
+      $else:
+        const float vx = *x;
+        float vacc = vx * vsixth + vhalf;
+        vacc = ${MAX_F32}(vacc, 0.0f);
+        vacc = ${MIN_F32}(vacc, vone);
+        vacc = vacc * vx;
+        *y = vacc;
+    }
+  $else:
+    for (; n >= sizeof(float); n -= sizeof(float)) {
+      const float vx = *x++;
+      float vacc = vx * vsixth + vhalf;
+      vacc = ${MAX_F32}(vacc, 0.0f);
+      vacc = ${MIN_F32}(vacc, vone);
+      vacc = vacc * vx;
+      *y++ = vacc;
+    }
+}
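
As a concrete instantiation of this template, WASM=1 with BATCH_TILE=2 yields roughly the following body for xnn_f32_hswish_ukernel__wasm_x2: the ${MIN_F32}/${MAX_F32} substitutions become the WebAssembly builtins, and since BATCH_TILE == 2 the remainder is at most a single element. A sketch (XNN_UNLIKELY elided to keep it self-contained):

    #include <stddef.h>

    // Sketch of the WASM=1, BATCH_TILE=2 expansion of scalar.c.in above.
    void hswish_wasm_x2_sketch(size_t n, const float* x, float* y,
                               float vsixth, float vhalf, float vone) {
      for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
        const float vx0 = x[0];
        const float vx1 = x[1];
        x += 2;

        float vacc0 = vx0 * vsixth + vhalf;
        float vacc1 = vx1 * vsixth + vhalf;

        vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
        vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);

        vacc0 = __builtin_wasm_min_f32(vacc0, vone);
        vacc1 = __builtin_wasm_min_f32(vacc1, vone);

        vacc0 *= vx0;
        vacc1 *= vx1;

        y[0] = vacc0;
        y[1] = vacc1;
        y += 2;
      }
      if (n != 0) {
        const float vx = *x;
        float vacc = vx * vsixth + vhalf;
        vacc = __builtin_wasm_max_f32(vacc, 0.0f);
        vacc = __builtin_wasm_min_f32(vacc, vone);
        vacc = vacc * vx;
        *y = vacc;
      }
    }
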
diff --git a/src/f32-hswish/sse.c b/src/f32-hswish/sse.c
deleted file mode 100644
index d8b7af1..0000000
--- a/src/f32-hswish/sse.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__sse(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const __m128 vsixth = _mm_load_ps(params->sse.sixth);
-  const __m128 vhalf = _mm_load_ps(params->sse.half);
-  const __m128 vone = _mm_load_ps(params->sse.one);
-  const __m128 vzero = _mm_setzero_ps();
-
-  for (; n >= 16; n -= 16) {
-    const __m128 vx = _mm_loadu_ps(x);
-    x += 4;
-
-    const __m128 vt = _mm_min_ps(_mm_max_ps(_mm_add_ps(_mm_mul_ps(vx, vsixth), vhalf), vzero), vone);
-    const __m128 vy = _mm_mul_ps(vt, vx);
-
-    _mm_storeu_ps(y, vy);
-    y += 4;
-  }
-  if (n != 0) {
-    const __m128 vx = _mm_loadu_ps(x);
-    x += 4;
-
-    const __m128 vt = _mm_min_ps(_mm_max_ps(_mm_add_ps(_mm_mul_ps(vx, vsixth), vhalf), vzero), vone);
-    __m128 vy = _mm_mul_ps(vt, vx);
-
-    if (n & 8) {
-      _mm_storel_pi((__m64*) y, vy);
-      vy = _mm_movehl_ps(vy, vy);
-      y += 2;
-    }
-    if (n & 4) {
-      _mm_store_ss(y, vy);
-    }
-  }
-}
diff --git a/src/f32-hswish/sse.c.in b/src/f32-hswish/sse.c.in
new file mode 100644
index 0000000..35b9701
--- /dev/null
+++ b/src/f32-hswish/sse.c.in
@@ -0,0 +1,86 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_hswish_ukernel__sse_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const union xnn_f32_hswish_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vsixth = _mm_load_ps(params->sse.sixth);
+  const __m128 vhalf = _mm_load_ps(params->sse.half);
+  const __m128 vone = _mm_load_ps(params->sse.one);
+  const __m128 vzero = _mm_setzero_ps();
+
+  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+    const __m128 vx${ABC[0:4]} = _mm_loadu_ps(x);
+    $for N in range(4, BATCH_TILE, 4):
+      const __m128 vx${ABC[N:N+4]} = _mm_loadu_ps(x + ${N});
+    x += ${BATCH_TILE};
+
+    $for N in range(0, BATCH_TILE, 4):
+      __m128 vacc${ABC[N:N+4]} = _mm_mul_ps(vx${ABC[N:N+4]}, vsixth);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = _mm_add_ps(vacc${ABC[N:N+4]}, vhalf);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = _mm_max_ps(vacc${ABC[N:N+4]}, vzero);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = _mm_min_ps(vacc${ABC[N:N+4]}, vone);
+
+    $for N in range(0, BATCH_TILE, 4):
+      vacc${ABC[N:N+4]} = _mm_mul_ps(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+
+    _mm_storeu_ps(y, vacc${ABC[0:4]});
+    $for N in range(4, BATCH_TILE, 4):
+      _mm_storeu_ps(y + ${N}, vacc${ABC[N:N+4]});
+    y += ${BATCH_TILE};
+  }
+  $if BATCH_TILE > 4:
+    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+      const __m128 vx0123 = _mm_loadu_ps(x);
+      x += 4;
+      __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+      vacc0123 = _mm_add_ps(vacc0123, vhalf);
+      vacc0123 = _mm_max_ps(vacc0123, vzero);
+      vacc0123 = _mm_min_ps(vacc0123, vone);
+      vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+      _mm_storeu_ps(y, vacc0123);
+      y += 4;
+    }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);
+    vacc0123 = _mm_add_ps(vacc0123, vhalf);
+    vacc0123 = _mm_max_ps(vacc0123, vzero);
+    vacc0123 = _mm_min_ps(vacc0123, vone);
+    vacc0123 = _mm_mul_ps(vacc0123, vx0123);
+
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vacc0123);
+      vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vacc0123);
+    }
+  }
+}
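
For reference, a hedged sketch of setting up the parameters and invoking one of these kernels directly. The _mm_load_ps calls above imply that the sse variant of the params union holds 16-byte-aligned float[4] fields; the values mirror the asserts in scalar.c.in (sixth is 1/6, half is 0.5, one is 1.0). The header location for the union is an assumption:

    #include <stddef.h>
    #include <xnnpack/hswish.h>  // declares xnn_f32_hswish_ukernel__sse_x8
    #include <xnnpack/params.h>  // assumed home of union xnn_f32_hswish_params

    void run_hswish_sse_x8(const float* input, float* output, size_t batch) {
      // Assumed field layout, implied by the _mm_load_ps loads above.
      union xnn_f32_hswish_params params;
      for (size_t i = 0; i < 4; i++) {
        params.sse.sixth[i] = 0x1.555556p-3f;  // 1/6 rounded to float
        params.sse.half[i] = 0.5f;
        params.sse.one[i] = 1.0f;
      }
      // The first argument is a byte count; batch need not be a multiple
      // of the tile, since the 4-wide loop and the partial-store tail
      // cover the remainder.
      xnn_f32_hswish_ukernel__sse_x8(batch * sizeof(float), input, output, &params);
    }
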
diff --git a/src/f32-hswish/wasm.c b/src/f32-hswish/wasm.c
deleted file mode 100644
index 3a0d2bb..0000000
--- a/src/f32-hswish/wasm.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/hswish.h>
-#include <xnnpack/math.h>
-
-
-void xnn_f32_hswish_ukernel__wasm(
-    size_t n,
-    const float* x,
-    float* y,
-    const union xnn_f32_hswish_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float vsixth = params->scalar.sixth;
-  const float vhalf = params->scalar.half;
-  const float vone = params->scalar.one;
-  assert(vhalf == 0.5f);
-  assert(vone == 1.0f);
-
-  do {
-    const float vx = *x++;
-
-    const float vt = __builtin_wasm_min_f32(__builtin_wasm_max_f32(vx * vsixth + vhalf, 0.0f), vone);
-    const float vy = vt * vx;
-
-    *y++ = vy;
-    n -= 4;
-  } while (n != 0);
-}
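
The deleted wasm.c, like scalar.c above, is subsumed by scalar.c.in instantiated with WASM=1, so nothing changes numerically: old and new kernels both compute hswish(x) = x * min(max(x/6 + 1/2, 0), 1). For spot-checking, a reference sketch (not part of the commit; FMA-based variants may differ from it by one ulp in the intermediate term):

    #include <math.h>

    // Reference hswish that the deleted and the generated kernels implement.
    float hswish_ref(float x) {
      const float t = fminf(fmaxf(x * (1.0f / 6.0f) + 0.5f, 0.0f), 1.0f);
      return t * x;
    }
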
diff --git a/src/init.c b/src/init.c
index 8c4424c..044472d 100644
--- a/src/init.c
+++ b/src/init.c
@@ -212,7 +212,7 @@
       .channel_tile = 8,
     };
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon;
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon;
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x8;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
       .row_tile = 2,
@@ -515,7 +515,7 @@
       .channel_tile = 8,
     };
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon;
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neonfma;
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neonfma_x8;
     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
@@ -862,7 +862,15 @@
     } else {
       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse;
     }
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse;
+    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
+      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx512f_x32;
+    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
+      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__fma3_x16;
+    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
+      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx_x16;
+    } else {
+      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse_x8;
+    }
     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse2_p5_div_x16;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
@@ -1093,7 +1101,7 @@
       .channel_tile = 8,
     };
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__psimd;
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__psimd;
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__psimd_x8;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__psimd_2x8,
       .row_tile = 2,
@@ -1298,7 +1306,7 @@
       .channel_tile = 2,
     };
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasm;
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasm;
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasm_x4;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
       .row_tile = 4,
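
The new x86 selection is ordered widest-ISA first and gated on !XNN_PLATFORM_MOBILE, so mobile x86 stays on the SSE kernel. Operator code is unaffected: it calls through the dispatch table with the same shape regardless of which kernel was installed. A hedged sketch of that call site, assuming xnn_params and the params union come from XNNPACK's internal headers:

    #include <stddef.h>

    // Illustration of the call shape only; xnn_params and
    // union xnn_f32_hswish_params are assumed declared elsewhere.
    void apply_hswish(size_t batch_size, const float* input, float* output,
                      const union xnn_f32_hswish_params* params) {
      // Whichever kernel init.c installed (sse_x8 up to avx512f_x32 on x86),
      // the contract is the same: byte count, input, output, params.
      xnn_params.f32.hswish(batch_size * sizeof(float), input, output, params);
    }
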
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
index d80595c..af6d84f 100644
--- a/src/xnnpack/hswish.h
+++ b/src/xnnpack/hswish.h
@@ -23,12 +23,34 @@
       float* y,                                      \
       const union xnn_f32_hswish_params* params);
 
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon_x4)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon_x8)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma_x4)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma_x8)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse_x4)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse_x8)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx_x8)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx_x16)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__fma3_x8)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__fma3_x16)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx512f_x16)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx512f_x32)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd_x4)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd_x8)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x1)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x2)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x4)
+
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar_x1)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar_x2)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar_x4)
 
 
 #ifdef __cplusplus