Additional Sigmoid micro-kernels and accuracy evaluation stub - PSIMD micro-kernels and accuracy evaluation stubs - ARM NEON micro-kernels using 2048-entry table lookups - ARM NEON micro-kernels with alternative division implementations - ARM NEON micro-kernels without FMA - x4..x24 version of all SIMD micro-kernels - Eliminated comparison with one_cutoff & corresponding blend in all micro-kernels PiperOrigin-RevId: 287804583

commit: 8d3c07e03a55862847e0a6a90f6f9177e87dba4a [log] [tgz]
author: Marat Dukhan <maratek@google.com> Thu Jan 02 01:20:59 2020 -0800
committer: XNNPACK Team <xnnpack-github-robot@google.com> Thu Jan 02 01:21:29 2020 -0800
tree: e328046667bbb59bdd88ce320abcf29f8857cc9a
parent: 279908a1af406a1973069979906d9fae569719fa [diff]
diff --git a/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x12.c b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x12.c
new file mode 100644
index 0000000..e6f0d34
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x12.c

@@ -0,0 +1,372 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for n bytes of floats (n must be a multiple of sizeof(float)); params is not used.
+// exp(-|x|) is built from a 2048-entry exp2 table lookup and a degree-1 polynomial; the reciprocal of 1 + exp(-|x|)
+// is refined with two vrecpsq_f32 Newton-Raphson steps. Elements are processed 12 at a time, then 4 at a time, then
+// a final partial vector of 1-3 elements. Generated file: mirror any change in the template it was produced from.
+void xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E400p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7F7D1Cp-31f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which rounds the result to an integer, then subtracting the
+    // large number back. The addition is combined with the multiplication by -log2e * 2048 in a single vmlaq_f32
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    // Each 64-bit vidx value packs two 32-bit indices; the high half supplies lane 1 of each pair.
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vmlaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vmlaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+    const uint32x4_t vm89AB = vcltq_s32(vreinterpretq_s32_f32(vx89AB), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which rounds the result to an integer, then subtracting the
+    // large number back. The addition is combined with the multiplication by -log2e * 2048 in a single vmlaq_f32
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE(review): reads 4 lanes with 1-3 elements left; confirm x is padded.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which rounds the result to an integer, then subtracting the
+    // large number back. The addition is combined with the multiplication by -log2e * 2048 in a single vmlaq_f32
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x16.c b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x16.c
new file mode 100644
index 0000000..dd1a134
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x16.c

@@ -0,0 +1,399 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E400p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7F7D1Cp-31f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vmlaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vmlaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vmlaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vmlaq_f32(vsCDEF, vsCDEF, vpCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x20.c b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x20.c
new file mode 100644
index 0000000..c79cba6
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x20.c

@@ -0,0 +1,426 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E400p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7F7D1Cp-31f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vmlaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vmlaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vmlaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vmlaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vmlaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vmlaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x24.c b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x24.c
new file mode 100644
index 0000000..2d24cf9
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x24.c

@@ -0,0 +1,453 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] := sigmoid(x[i]) for a contiguous array of floats using ARM NEON intrinsics.
+//
+// The exponential is evaluated with a 2048-entry table of 2**(k/2048) plus a degree-1 polynomial, and the division
+// is replaced by a reciprocal estimate refined with two Newton-Raphson (VRECPS) steps.
+//
+// Parameters:
+//   n      - size of the input in BYTES; must be a multiple of sizeof(float) (checked by assert).
+//   x      - pointer to the input elements.
+//   y      - pointer to the output elements.
+//   params - not referenced by this kernel.
+//
+// Processing is done in three stages: 24 elements per iteration, then 4 elements per iteration, then a final
+// partial vector of 1-3 elements. The final stage loads a full 4-element vector from x even when fewer elements
+// remain, so up to 3 floats past the end of the input are read (but only the valid outputs are stored).
+//
+// NOTE(review): this file is marked as auto-generated from src/f32-sigmoid/neon-lut2048-p1.c.in; any change made
+// here presumably needs to be mirrored in that template.
+void xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E400p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7F7D1Cp-31f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  // Main loop: 24 elements (6 vectors of 4 floats) per iteration.
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) * 2048
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+    float32x4_t vnKLMN = vmlaq_f32(vmagic_bias, vzKLMN, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veKLMN = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnKLMN), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (mask 0x7FF) of n, as integer, as an index for table lookup of
+    // l := 2**((n % 2048) / 2048). Pairs of 32-bit indices are viewed as one 64-bit lane so that each pair can be
+    // moved to a general-purpose register with a single lane extraction.
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+    const uint64x2_t vidxKLMN = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnKLMN), vindex_mask));
+
+    // Load the table entry for the even element of each pair (low 32 bits of the 64-bit index pair).
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+    const uint64_t vidxKL = vgetq_lane_u64(vidxKLMN, 0);
+    const uint64_t vidxMN = vgetq_lane_u64(vidxKLMN, 1);
+    float32x2_t vlKL = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxKL]);
+    float32x2_t vlMN = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxMN]);
+
+    // Load the table entry for the odd element of each pair (high 32 bits) into lane 1, then merge the halves.
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+    vlKL = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxKL >> 32)], vlKL, 1);
+    vlMN = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxMN >> 32)], vlMN, 1);
+    const float32x4_t vlKLMN = vcombine_f32(vlKL, vlMN);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlKLMN), veKLMN));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vmlaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vmlaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+    float32x4_t vtKLMN = vmlaq_f32(vzKLMN, vnKLMN, vln2_o2048_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+    vtKLMN = vmlaq_f32(vtKLMN, vnKLMN, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+    const float32x4_t vpKLMN = vmulq_f32(vtKLMN, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vmlaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vmlaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vmlaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vmlaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vmlaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+    const float32x4_t vyKLMN = vmlaq_f32(vsKLMN, vsKLMN, vpKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+    const float32x4_t vdKLMN = vaddq_f32(vyKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float vector: vcltq_f32 compares two float32x4_t values, so it is built with
+    // vmovq_n_f32 rather than an integer-vector constructor.
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  // Secondary loop: one vector of 4 elements per iteration, same algorithm as above.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) * 2048
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (mask 0x7FF) of n, as integer, as an index for table lookup of
+    // l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // Here the sign test is done on the raw bit pattern of x as a signed integer.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Tail: 1-3 remaining elements. A full vector is loaded and processed, but only the valid lanes are stored.
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) * 2048
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (mask 0x7FF) of n, as integer, as an index for table lookup of
+    // l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // Here the sign test is done on the raw bit pattern of x as a signed integer.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if bit 1 of the remaining count is set, then 1 element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x4.c b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x4.c
new file mode 100644
index 0000000..e7c0e78
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x4.c

@@ -0,0 +1,225 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for a batch of floats with ARM NEON intrinsics, 4 elements per iteration.
+// exp(-|x|) is reconstructed from a lookup in xnn_table_exp2_k_over_2048 and a degree-1 polynomial, and the
+// division is replaced with a reciprocal estimate refined by two VRECPS Newton-Raphson steps.
+//
+//   n      - size of the batch in bytes; must be a multiple of sizeof(float).
+//   x      - input elements.
+//   y      - output elements.
+//   params - not referenced by this micro-kernel.
+//
+// NOTE(review): when n is not a multiple of 4 floats, the remainder path still loads a full 4-float vector from x
+// (only the valid results are stored), so the caller presumably must tolerate reads past the end of x - confirm.
+void xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E400p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7F7D1Cp-31f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficient of the degree-1 polynomial P1(t) = 1 + t * c1 used below.
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  // Mask selecting the low 11 bits of n, which index the 2048-entry table.
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  // Main loop: one full vector of 4 elements per iteration.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n / 2048 has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // The four 32-bit indices are viewed as two 64-bit lanes, so each 64-bit scalar carries two indices:
+    // one in its low 32 bits and one in its high 32 bits.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    // Load the entry for the low index into both lanes, then overwrite lane 1 with the entry for the high index.
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    // Only the t * c1 term is formed here; the "1 +" is folded into the reconstruction below.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The sign is tested on the raw bits of x via a signed-integer compare against 0.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Remainder: 1 to 3 elements are left. A full vector is loaded and processed exactly as in the main loop,
+  // but only the remaining elements are stored.
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n / 2048 has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // The four 32-bit indices are viewed as two 64-bit lanes, so each 64-bit scalar carries two indices:
+    // one in its low 32 bits and one in its high 32 bits.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    // Load the entry for the low index into both lanes, then overwrite lane 1 with the entry for the high index.
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    // Only the t * c1 term is formed here; the "1 +" is folded into the reconstruction below.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The sign is tested on the raw bits of x via a signed-integer compare against 0.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store the remaining 1-3 results: two elements first if bit 1 of the element count is set (then continue
+    // with the upper half of the vector), followed by a single element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x8.c b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x8.c
new file mode 100644
index 0000000..563836d
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x8.c

@@ -0,0 +1,345 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for a batch of floats with ARM NEON intrinsics, 8 elements per main-loop
+// iteration, followed by a 4-element loop and a 1-3 element remainder path.
+// exp(-|x|) is reconstructed from a lookup in xnn_table_exp2_k_over_2048 and a degree-1 polynomial, and the
+// division is replaced with a reciprocal estimate refined by two VRECPS Newton-Raphson steps.
+//
+//   n      - size of the batch in bytes; must be a multiple of sizeof(float).
+//   x      - input elements.
+//   y      - output elements.
+//   params - not referenced by this micro-kernel.
+//
+// NOTE(review): when n is not a multiple of 4 floats, the remainder path still loads a full 4-float vector from x
+// (only the valid results are stored), so the caller presumably must tolerate reads past the end of x - confirm.
+void xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E400p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7F7D1Cp-31f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficient of the degree-1 polynomial P1(t) = 1 + t * c1 used below.
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  // Mask selecting the low 11 bits of n, which index the 2048-entry table.
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  // Main loop: two vectors (8 elements) per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n / 2048 has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // The four 32-bit indices of each vector are viewed as two 64-bit lanes, so each 64-bit scalar carries two
+    // indices: one in its low 32 bits and one in its high 32 bits.
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+
+    // Load the entry for each low index into both lanes of a 2-element vector...
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+
+    // ...then overwrite lane 1 with the entry for the high index and merge the halves.
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_o2048_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    // Only the t * c1 term is formed here; the "1 +" is folded into the reconstruction below.
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vmlaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vmlaq_f32(vs4567, vs4567, vp4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The sign is tested on the raw bits of x via a signed-integer compare against 0, exactly as in the
+    // 4-element and remainder paths below. Both operands of the compare must be int32x4_t: mixing a float
+    // compare with an integer zero vector is a vector type mismatch.
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  // Secondary loop: one full vector of 4 elements per iteration.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n / 2048 has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit scalar carries two 32-bit indices: one in its low half and one in its high half.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    // Load the entry for the low index into both lanes, then overwrite lane 1 with the entry for the high index.
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    // Only the t * c1 term is formed here; the "1 +" is folded into the reconstruction below.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The sign is tested on the raw bits of x via a signed-integer compare against 0.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Remainder: 1 to 3 elements are left. A full vector is loaded and processed exactly as in the 4-element loop,
+  // but only the remaining elements are stored.
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single multiply-accumulate (vmlaq_f32).
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n / 2048 has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit scalar carries two 32-bit indices: one in its low half and one in its high half.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    // Load the entry for the low index into both lanes, then overwrite lane 1 with the entry for the high index.
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    // Only the t * c1 term is formed here; the "1 +" is folded into the reconstruction below.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The sign is tested on the raw bits of x via a signed-integer compare against 0.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store the remaining 1-3 results: two elements first if bit 1 of the element count is set (then continue
+    // with the upper half of the vector), followed by a single element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-p5-nr2recps-x12.c b/src/f32-sigmoid/gen/neon-p5-nr2recps-x12.c
new file mode 100644
index 0000000..cb2b088
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-p5-nr2recps-x12.c

@@ -0,0 +1,309 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  // Writes y[i] := sigmoid(x[i]) for n bytes of floats, 12 floats per main-loop iteration; params is unused.
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized; also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e in one vmlaq_f32 call.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB);
+
+    vp0123 = vmlaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = vmlaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = vmlaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+
+    float32x4_t ve0123 = vmlaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vmlaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations, r := r * (2 - r * d)) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (vcagtq_f32 compares absolute values).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e in one vmlaq_f32 call.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations, r := r * (2 - r * d)) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (vcagtq_f32 compares absolute values).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // Fewer than 4 floats remain, yet a full 4-lane vector is loaded above: that reads up to 3 floats past the
+    // last valid element of x. Only the valid lanes are stored at the end of this block.
+    //
+    // Same algorithm as in the loops above:
+    //   f[x] := exp(x) / (1 + exp(x)) if x <= 0, and 1 - f[-x] if x >= 0
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e in one vmlaq_f32 call.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations, r := r * (2 - r * d)) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (vcagtq_f32 compares absolute values).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-p5-nr2recps-x16.c b/src/f32-sigmoid/gen/neon-p5-nr2recps-x16.c
new file mode 100644
index 0000000..d8f531a
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-p5-nr2recps-x16.c

@@ -0,0 +1,331 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  // Writes y[i] := sigmoid(x[i]) for n bytes of floats, 16 floats per main-loop iteration; params is unused.
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized; also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e in one vmlaq_f32 call.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vmlaq_f32(vzCDEF, vnCDEF, vln2_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = vmlaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = vmlaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = vmlaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+
+    float32x4_t ve0123 = vmlaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vmlaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations, r := r * (2 - r * d)) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (vcagtq_f32 compares absolute values).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e in one vmlaq_f32 call.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations, r := r * (2 - r * d)) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (vcagtq_f32 compares absolute values).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // Fewer than 4 floats remain, yet a full 4-lane vector is loaded above: that reads up to 3 floats past the
+    // last valid element of x. Only the valid lanes are stored at the end of this block.
+    //
+    // Same algorithm as in the loops above:
+    //   f[x] := exp(x) / (1 + exp(x)) if x <= 0, and 1 - f[-x] if x >= 0
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e in one vmlaq_f32 call.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations, r := r * (2 - r * d)) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (vcagtq_f32 compares absolute values).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-p5-nr2recps-x20.c b/src/f32-sigmoid/gen/neon-p5-nr2recps-x20.c
new file mode 100644
index 0000000..0b8b469
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-p5-nr2recps-x20.c

@@ -0,0 +1,353 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using ARM NEON intrinsics.
+//
+// exp(-|x|) is approximated with a degree-5 polynomial after Cody-Waite range reduction, and the division by
+// (1 + exp(-|x|)) is replaced by a reciprocal estimate (VRECPE) refined with two Newton-Raphson (VRECPS) steps.
+//
+//   n      - size of the input in BYTES; must be a multiple of sizeof(float) (checked by assert).
+//   x      - input elements.
+//   y      - output elements.
+//   params - not used by this kernel.
+//
+// Elements are processed 20 at a time, then 4 at a time, then a final partial group of 1-3 elements. The partial
+// group still loads a full 4-element vector from x, so up to 3 floats past the last input element are read; only
+// the valid elements are stored to y.
+void xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+  // Floating-point zero used for the sign test (x < 0). It must be a float32x4_t: vcltq_f32 takes two float vectors,
+  // and passing an integer vector only compiles when lax vector conversions are enabled.
+  const float32x4_t vzero = vmovq_n_f32(0.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The addition is combined with multiplication by log2e into a single
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds (|x| <= 2**22),
+    // but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow
+    // or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vmlaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vmlaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vmlaq_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = vmlaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vmlaq_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = vmlaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vmlaq_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = vmlaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vmlaq_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+
+    float32x4_t ve0123 = vmlaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vmlaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vmlaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (clear all bits).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vzero);
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vzero);
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vzero);
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vzero);
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vzero);
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The addition is combined with multiplication by log2e into a single
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds (|x| <= 2**22),
+    // but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow
+    // or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (clear all bits).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vzero);
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1-3 elements remain. A full vector is loaded regardless; only the valid lanes are stored below.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The addition is combined with multiplication by log2e into a single
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds (|x| <= 2**22),
+    // but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow
+    // or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (clear all bits).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vzero);
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements and/or 1 element depending on the bits of the remaining byte count.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-p5-nr2recps-x24.c b/src/f32-sigmoid/gen/neon-p5-nr2recps-x24.c
new file mode 100644
index 0000000..3ae42ec
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-p5-nr2recps-x24.c

@@ -0,0 +1,375 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using ARM NEON intrinsics.
+//
+// exp(-|x|) is approximated with a degree-5 polynomial after Cody-Waite range reduction, and the division by
+// (1 + exp(-|x|)) is replaced by a reciprocal estimate (VRECPE) refined with two Newton-Raphson (VRECPS) steps.
+//
+//   n      - size of the input in BYTES; must be a multiple of sizeof(float) (checked by assert).
+//   x      - input elements.
+//   y      - output elements.
+//   params - not used by this kernel.
+//
+// Elements are processed 24 at a time, then 4 at a time, then a final partial group of 1-3 elements. The partial
+// group still loads a full 4-element vector from x, so up to 3 floats past the last input element are read; only
+// the valid elements are stored to y.
+void xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+  // Floating-point zero used for the sign test (x < 0). It must be a float32x4_t: vcltq_f32 takes two float vectors,
+  // and passing an integer vector only compiles when lax vector conversions are enabled.
+  const float32x4_t vzero = vmovq_n_f32(0.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The addition is combined with multiplication by log2e into a single
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds (|x| <= 2**22),
+    // but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow
+    // or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vmlaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vmlaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vmlaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+    float32x4_t vnKLMN = vmlaq_f32(vmagic_bias, vzKLMN, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vmlaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vmlaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vmlaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+    float32x4_t vtKLMN = vmlaq_f32(vzKLMN, vnKLMN, vln2_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vmlaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vmlaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vmlaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+    vtKLMN = vmlaq_f32(vtKLMN, vnKLMN, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vmlaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vmlaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vmlaq_f32(vc4, vc5, vtGHIJ);
+    float32x4_t vpKLMN = vmlaq_f32(vc4, vc5, vtKLMN);
+
+    vp0123 = vmlaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vmlaq_f32(vc3, vpGHIJ, vtGHIJ);
+    vpKLMN = vmlaq_f32(vc3, vpKLMN, vtKLMN);
+
+    vp0123 = vmlaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vmlaq_f32(vc2, vpGHIJ, vtGHIJ);
+    vpKLMN = vmlaq_f32(vc2, vpKLMN, vtKLMN);
+
+    vp0123 = vmlaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vmlaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vmlaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vmlaq_f32(vc1, vpGHIJ, vtGHIJ);
+    vpKLMN = vmlaq_f32(vc1, vpKLMN, vtKLMN);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+    vtKLMN = vmulq_f32(vtKLMN, vsKLMN);
+
+    float32x4_t ve0123 = vmlaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vmlaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vmlaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vmlaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vmlaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+    float32x4_t veKLMN = vmlaq_f32(vsKLMN, vpKLMN, vtKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+    float32x4_t vdKLMN = vaddq_f32(veKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (clear all bits).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vzero);
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vzero);
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vzero);
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vzero);
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vzero);
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vzero);
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The addition is combined with multiplication by log2e into a single
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds (|x| <= 2**22),
+    // but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow
+    // or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (clear all bits).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vzero);
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1-3 elements remain. A full vector is loaded regardless; only the valid lanes are stored below.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The addition is combined with multiplication by log2e into a single
+    // multiply-accumulate. The trick with adding large number is valid only within certain bounds (|x| <= 2**22),
+    // but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow
+    // or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (clear all bits).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vzero);
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements and/or 1 element depending on the bits of the remaining byte count.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-p5-nr2recps-x4.c b/src/f32-sigmoid/gen/neon-p5-nr2recps-x4.c
new file mode 100644
index 0000000..ec0ccbc
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-p5-nr2recps-x4.c

@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using ARM NEON intrinsics, processing 4 elements per
+// main-loop iteration.
+//
+//   n      - size of the input in BYTES; asserted to be a multiple of sizeof(float).
+//   x      - pointer to the input elements.
+//   y      - pointer to the output elements.
+//   params - not referenced by this micro-kernel.
+//
+// exp(-|x|) is reconstructed as a power-of-two scale times a degree-5 polynomial, and the reciprocal of
+// 1 + exp(-|x|) is computed with vrecpeq_f32 refined by two vrecpsq_f32 Newton-Raphson steps.
+void xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficients of the degree-5 polynomial used to approximate exp(-t).
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  // Main loop: one full vector (4 floats) per iteration.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // multiply-accumulate instruction. The trick with adding a large number is valid only within certain bounds
+    // (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end
+    // of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float32x4_t vector to match the vcltq_f32 signature.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Remainder: 1 to 3 elements left.
+  if XNN_UNLIKELY(n != 0) {
+    // A full 4-lane vector is loaded even though fewer than 4 elements remain; presumably the caller guarantees
+    // that the extra lanes are readable - TODO confirm against the micro-kernel contract.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // multiply-accumulate instruction. The trick with adding a large number is valid only within certain bounds
+    // (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end
+    // of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store only the valid lanes: two lanes if the 2-element bit of n is set, then one lane if the 1-element bit
+    // is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neon-p5-nr2recps-x8.c b/src/f32-sigmoid/gen/neon-p5-nr2recps-x8.c
new file mode 100644
index 0000000..2f59e76
--- /dev/null
+++ b/src/f32-sigmoid/gen/neon-p5-nr2recps-x8.c

@@ -0,0 +1,287 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using ARM NEON intrinsics, processing 8 elements per
+// main-loop iteration, followed by a 4-element loop and a 1-3 element remainder.
+//
+//   n      - size of the input in BYTES; asserted to be a multiple of sizeof(float).
+//   x      - pointer to the input elements.
+//   y      - pointer to the output elements.
+//   params - not referenced by this micro-kernel.
+//
+// exp(-|x|) is reconstructed as a power-of-two scale times a degree-5 polynomial, and the reciprocal of
+// 1 + exp(-|x|) is computed with vrecpeq_f32 refined by two vrecpsq_f32 Newton-Raphson steps.
+void xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficients of the degree-5 polynomial used to approximate exp(-t).
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  // Main loop: two vectors (8 floats) per iteration, with the two computations interleaved.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // multiply-accumulate instruction. The trick with adding a large number is valid only within certain bounds
+    // (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end
+    // of the algorithm.
+    float32x4_t vn0123 = vmlaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vmlaq_f32(vmagic_bias, vz4567, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vmlaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vmlaq_f32(vz4567, vn4567, vln2_hi);
+
+    vt0123 = vmlaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vmlaq_f32(vt4567, vn4567, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vmlaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vmlaq_f32(vc4, vc5, vt4567);
+
+    vp0123 = vmlaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc3, vp4567, vt4567);
+
+    vp0123 = vmlaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc2, vp4567, vt4567);
+
+    vp0123 = vmlaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vmlaq_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+
+    float32x4_t ve0123 = vmlaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vmlaq_f32(vs4567, vp4567, vt4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  // Secondary loop: one full vector (4 floats) per iteration.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // multiply-accumulate instruction. The trick with adding a large number is valid only within certain bounds
+    // (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end
+    // of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float32x4_t vector to match the vcltq_f32 signature.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Remainder: 1 to 3 elements left.
+  if XNN_UNLIKELY(n != 0) {
+    // A full 4-lane vector is loaded even though fewer than 4 elements remain; presumably the caller guarantees
+    // that the extra lanes are readable - TODO confirm against the micro-kernel contract.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // multiply-accumulate instruction. The trick with adding a large number is valid only within certain bounds
+    // (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end
+    // of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store only the valid lanes: two lanes if the 2-element bit of n is set, then one lane if the 1-element bit
+    // is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x12.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x12.c
new file mode 100644
index 0000000..c3cd2dd
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x12.c

@@ -0,0 +1,338 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(vy0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(vy4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(vy89AB, vd89AB);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x16.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x16.c
new file mode 100644
index 0000000..7621cd0
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x16.c

@@ -0,0 +1,362 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+  // Computes y[i] = sigmoid(x[i]); n is the size of x and y in bytes (see assert above); params is not read here.
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(vy0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(vy4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(vy89AB, vd89AB);
+    float32x4_t vfCDEF = vdivq_f32(vyCDEF, vdCDEF);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (final result becomes 0 or 1).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (final result becomes 0 or 1).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // Loads 4 floats although only 1-3 remain: reads past the end of x.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (final result becomes 0 or 1).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x20.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x20.c
new file mode 100644
index 0000000..0ded1e0
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x20.c

@@ -0,0 +1,386 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] := sigmoid(x[i]) over n / sizeof(float) floats with NEON FMA intrinsics, 20 elements per main-loop
+// iteration. exp(-|x|) is reconstructed from a 2048-entry table lookup and a degree-1 polynomial, and the sigmoid
+// fraction is evaluated with vdivq_f32.
+//   n      - size of the input in bytes; asserted to be a multiple of sizeof(float).
+//   x, y   - input and output arrays. The tail path loads a full 4-float vector even when only 1-3 elements remain.
+//   params - not used by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(vy0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(vy4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(vy89AB, vd89AB);
+    float32x4_t vfCDEF = vdivq_f32(vyCDEF, vdCDEF);
+    float32x4_t vfGHIJ = vdivq_f32(vyGHIJ, vdGHIJ);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is detected by testing the sign bit.
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+    const uint32x4_t vm89AB = vcltq_s32(vreinterpretq_s32_f32(vx89AB), vmovq_n_s32(0));
+    const uint32x4_t vmCDEF = vcltq_s32(vreinterpretq_s32_f32(vxCDEF), vmovq_n_s32(0));
+    const uint32x4_t vmGHIJ = vcltq_s32(vreinterpretq_s32_f32(vxGHIJ), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is detected by testing the sign bit.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // Remaining 1-3 elements: same algorithm as in the loops above on one full vector; only valid lanes are stored.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is detected by testing the sign bit.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x24.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x24.c
new file mode 100644
index 0000000..8f116f9
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x24.c

@@ -0,0 +1,410 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veKLMN = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnKLMN), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+    const uint64x2_t vidxKLMN = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnKLMN), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+    const uint64_t vidxKL = vgetq_lane_u64(vidxKLMN, 0);
+    const uint64_t vidxMN = vgetq_lane_u64(vidxKLMN, 1);
+    float32x2_t vlKL = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxKL]);
+    float32x2_t vlMN = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxMN]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+    vlKL = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxKL >> 32)], vlKL, 1);
+    vlMN = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxMN >> 32)], vlMN, 1);
+    const float32x4_t vlKLMN = vcombine_f32(vlKL, vlMN);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlKLMN), veKLMN));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+    const float32x4_t vpKLMN = vmulq_f32(vtKLMN, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+    const float32x4_t vyKLMN = vfmaq_f32(vsKLMN, vsKLMN, vpKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+    const float32x4_t vdKLMN = vaddq_f32(vyKLMN, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(vy0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(vy4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(vy89AB, vd89AB);
+    float32x4_t vfCDEF = vdivq_f32(vyCDEF, vdCDEF);
+    float32x4_t vfGHIJ = vdivq_f32(vyGHIJ, vdGHIJ);
+    float32x4_t vfKLMN = vdivq_f32(vyKLMN, vdKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_s32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x4.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x4.c
new file mode 100644
index 0000000..114100b
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x4.c

@@ -0,0 +1,206 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x4(  // y[i] := sigmoid(x[i]) for n / sizeof(float) floats.
+    size_t n,  // Size of the input in bytes; asserted to be a multiple of sizeof(float).
+    const float* x,  // Input elements, read 4 floats at a time.
+    float* y,  // Output elements; exactly n bytes are written.
+    const void* params)  // Not referenced anywhere in this function body.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);  // 1.5 * 2**23
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);  // High part of log(2) / 2048 (Cody-Waite).
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);  // Low part of log(2) / 2048 (Cody-Waite).
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);  // Coefficient c1 of the degree-1 polynomial P1(t) = 1 + t * c1.
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));  // Keeps the 11 low bits: index into the 2048-entry table.
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The addition is fused with the multiplication by -2048 * log2(e) into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of n,
+    //    as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  // Two 32-bit indices packed in one 64-bit value.
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);  // Two 32-bit indices packed in one 64-bit value.
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1; only the term p := t * c1 is formed here, the "1 +" is folded into the next step.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (bit-clear under the compare mask).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z). The mask tests the sign bit of x as an integer.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Tail: 1 to 3 elements remain after the main loop.
+    const float32x4_t vx = vld1q_f32(x);  // NOTE(review): reads 4 floats though 1-3 remain; confirm over-read is safe.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The addition is fused with the multiplication by -2048 * log2(e) into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of n,
+    //    as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);  // Two 32-bit indices packed in one 64-bit value.
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);  // Two 32-bit indices packed in one 64-bit value.
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1; only the term p := t * c1 is formed here, the "1 +" is folded into the next step.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (bit-clear under the compare mask).
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z). The mask tests the sign bit of x as an integer.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // Two or three elements remain: store the low pair, then switch to the high pair.
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // An odd element remains: store lane 0 of the current pair.
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x8.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x8.c
new file mode 100644
index 0000000..26ef0ab
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x8.c

@@ -0,0 +1,314 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x8(
+    size_t n,            // Size of the input in bytes; asserted to be a multiple of sizeof(float).
+    const float* x,      // Input elements.
+    float* y,            // Output elements: y[i] := sigmoid(x[i]).
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(vy0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(vy4567, vd4567);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE: loads a full 4-float vector although only 1-3 elements remain.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(vy, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store the remaining 1-3 results: two lanes first if bit 1 of the element count is set, then a single lane.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x12.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x12.c
new file mode 100644
index 0000000..c25ae2f
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x12.c

@@ -0,0 +1,371 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x16.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x16.c
new file mode 100644
index 0000000..dbb1cb6
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x16.c

@@ -0,0 +1,398 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x16(
+    size_t n,  // size of the input in bytes; must be a multiple of sizeof(float)
+    const float* x,  // input elements
+    float* y,  // output elements: sigmoid of the corresponding input element
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048 = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations: one via VRECPS, one via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+
+    // Where |x| exceeds the denormal cutoff, sigmoid(-z) is not normalized: replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations: one via VRECPS, one via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // Where |x| exceeds the denormal cutoff, sigmoid(-z) is not normalized: replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // full 4-lane load even though only 1-3 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits (0:10) of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations: one via VRECPS, one via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // Where |x| exceeds the denormal cutoff, sigmoid(-z) is not normalized: replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // two or three elements remain: store the low pair, continue with the high pair
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // odd element count: store the single remaining lane
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x20.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x20.c
new file mode 100644
index 0000000..526bbd9
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x20.c

@@ -0,0 +1,425 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for a contiguous array of floats.
+//
+// Parameters:
+//   n      - size of the input in BYTES; must be a multiple of sizeof(float).
+//   x      - pointer to the input elements.
+//   y      - pointer to the output elements.
+//   params - not used by this kernel.
+//
+// The main loop handles 20 elements (five 4-lane vectors) per iteration, a second loop handles one 4-lane vector
+// per iteration, and a final branch handles the last 1-3 elements. All three paths run the same algorithm.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  // Mask of the 11 low bits of n: selects the table index, and its complement selects the exponent adjustment.
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that
+    //    the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    // The four 32-bit indices are viewed as two 64-bit lanes so that each scalar extraction yields two indices.
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+
+    // Low 32 bits of each 64-bit value index the entry for the even lane.
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+
+    // High 32 bits of each 64-bit value index the entry for the odd lane.
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    // First iteration uses the reciprocal-step instruction: r := r * recps(r, d).
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    // Second iteration uses fused multiply-add: r := r + r * (1 - r * d).
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The mask is built from the sign bit via a signed integer compare against zero, exactly as in the remainder
+    // paths below, so both operands of the comparison have the same (integer) vector type.
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+    const uint32x4_t vm89AB = vcltq_s32(vreinterpretq_s32_f32(vx89AB), vmovq_n_s32(0));
+    const uint32x4_t vmCDEF = vcltq_s32(vreinterpretq_s32_f32(vxCDEF), vmovq_n_s32(0));
+    const uint32x4_t vmGHIJ = vcltq_s32(vreinterpretq_s32_f32(vxGHIJ), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  // Process remaining full vectors of 4 elements, one vector per iteration.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that
+    //    the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Process the final 1-3 elements.
+  if XNN_UNLIKELY(n != 0) {
+    // A full 4-lane vector is loaded here even though fewer than 4 elements remain; only the valid lanes are stored.
+    // NOTE(review): this presumably relies on callers tolerating reads past the end of x - confirm.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that
+    //    the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if bit 1 of the remaining count is set, then 1 element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x24.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x24.c
new file mode 100644
index 0000000..18588a8
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x24.c

@@ -0,0 +1,452 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x24(
+    size_t n,  // size of the input in bytes; must be a multiple of sizeof(float)
+    const float* x,
+    float* y,
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veKLMN = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnKLMN), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n (the low 11 bits), as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+    const uint64x2_t vidxKLMN = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnKLMN), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+    const uint64_t vidxKL = vgetq_lane_u64(vidxKLMN, 0);
+    const uint64_t vidxMN = vgetq_lane_u64(vidxKLMN, 1);
+    float32x2_t vlKL = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxKL]);
+    float32x2_t vlMN = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxMN]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+    vlKL = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxKL >> 32)], vlKL, 1);
+    vlMN = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxMN >> 32)], vlMN, 1);
+    const float32x4_t vlKLMN = vcombine_f32(vlKL, vlMN);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlKLMN), veKLMN));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+    const float32x4_t vpKLMN = vmulq_f32(vtKLMN, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+    const float32x4_t vyKLMN = vfmaq_f32(vsKLMN, vsKLMN, vpKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+    const float32x4_t vdKLMN = vaddq_f32(vyKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations: one via VRECPS, one via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+    vrKLMN = vfmaq_f32(vrKLMN, vrKLMN, vfmsq_f32(vone, vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n (the low 11 bits), as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations: one via VRECPS, one via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // full 4-lane load although only 1-3 floats remain; only those are stored below
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n (the low 11 bits), as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations: one via VRECPS, one via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x4.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x4.c
new file mode 100644
index 0000000..efccf4e
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x4.c

@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) over a buffer of floats with NEON FMA intrinsics, 4 elements per loop iteration.
+//
+// exp(-|x|) is reconstructed from a lookup in xnn_table_exp2_k_over_2048 and a degree-1 polynomial; the reciprocal
+// of 1 + exp(-|x|) is an estimate refined by one VRECPS-based step followed by one FMA-based step.
+//
+//   n      - size of the input in bytes; must be a multiple of sizeof(float) (asserted).
+//   x      - input elements.
+//   y      - output elements, written in the same order as the inputs.
+//   params - not referenced by this kernel.
+//
+// NOTE(review): when n is not a multiple of 4 floats, the tail still loads a full 4-float vector from x, so the
+// input presumably has to be readable a few elements past its end - confirm against the callers' padding guarantees.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  // 1.5 * 2**23: adding it to a small value leaves the rounded integer in the low mantissa bits.
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // log(2) / 2048 split into a high and a low part for Cody-Waite range reduction.
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficient of the degree-1 polynomial P1(t) = 1 + t * c1.
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  // Selects the 11 low bits of n, which index the 2048-entry table.
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  // Main loop: one full vector of 4 floats per iteration.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single FMA instruction. The trick with adding a large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit lane of vidx packs two 32-bit indices: the low half feeds the dup load, the high half lane 1.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: the first iteration uses
+    // VRECPS, the second is spelled out with FMA as r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The mask is derived from the sign bit of x via a signed integer comparison with zero.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Tail: 1 to 3 floats remain. A full vector is loaded and processed, but only the remaining elements are stored.
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single FMA instruction. The trick with adding a large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit lane of vidx packs two 32-bit indices: the low half feeds the dup load, the high half lane 1.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: the first iteration uses
+    // VRECPS, the second is spelled out with FMA as r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The mask is derived from the sign bit of x via a signed integer comparison with zero.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if the remaining byte count has the 2-float bit set, then 1 more if the 1-float bit is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x8.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x8.c
new file mode 100644
index 0000000..8c96519
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x8.c

@@ -0,0 +1,344 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) over a buffer of floats with NEON FMA intrinsics, 8 elements per main-loop
+// iteration, followed by a 4-element loop and a 1-3 element tail.
+//
+// exp(-|x|) is reconstructed from a lookup in xnn_table_exp2_k_over_2048 and a degree-1 polynomial; the reciprocal
+// of 1 + exp(-|x|) is an estimate refined by one VRECPS-based step followed by one FMA-based step.
+//
+//   n      - size of the input in bytes; must be a multiple of sizeof(float) (asserted).
+//   x      - input elements.
+//   y      - output elements, written in the same order as the inputs.
+//   params - not referenced by this kernel.
+//
+// NOTE(review): when n is not a multiple of 4 floats, the tail still loads a full 4-float vector from x, so the
+// input presumably has to be readable a few elements past its end - confirm against the callers' padding guarantees.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  // 1.5 * 2**23: adding it to a small value leaves the rounded integer in the low mantissa bits.
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // log(2) / 2048 split into a high and a low part for Cody-Waite range reduction.
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficient of the degree-1 polynomial P1(t) = 1 + t * c1.
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  // Selects the 11 low bits of n, which index the 2048-entry table.
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  // Main loop: two vectors (8 floats) per iteration, with the two dependency chains interleaved.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single FMA instruction. The trick with adding a large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit lane of vidx packs two 32-bit indices: the low half feeds the dup load, the high half lane 1.
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: the first iteration uses
+    // VRECPS, the second is spelled out with FMA as r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The mask is derived from the sign bit of x via a signed integer comparison with zero, exactly as in the
+    // 4-element loop and the tail below, so both operands of the comparison have matching vector types.
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  // Secondary loop: one full vector of 4 floats per iteration (runs at most once after the 8-element loop).
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single FMA instruction. The trick with adding a large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit lane of vidx packs two 32-bit indices: the low half feeds the dup load, the high half lane 1.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: the first iteration uses
+    // VRECPS, the second is spelled out with FMA as r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The mask is derived from the sign bit of x via a signed integer comparison with zero.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Tail: 1 to 3 floats remain. A full vector is loaded and processed, but only the remaining elements are stored.
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single FMA instruction. The trick with adding a large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from xnn_table_exp2_k_over_2048 using the 11 low bits of
+    //    n, as integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point
+    //    exponent is 0.
+    // 2. Adjust the fetched value by adding e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    // Each 64-bit lane of vidx packs two 32-bit indices: the low half feeds the dup load, the high half lane 1.
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from xnn_table_exp2_k_over_2048 to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: the first iteration uses
+    // VRECPS, the second is spelled out with FMA as r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The mask is derived from the sign bit of x via a signed integer comparison with zero.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if the remaining byte count has the 2-float bit set, then 1 more if the 1-float bit is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x12.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x12.c
new file mode 100644
index 0000000..56cc409
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x12.c

@@ -0,0 +1,371 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for n / sizeof(float) single-precision elements using ARM NEON with FMA.
+// exp(-|x|) is evaluated as a 2048-entry table lookup times a degree-1 polynomial; the reciprocal of
+// 1 + exp(-|x|) is refined from the VRECPE estimate with two FMA-based Newton-Raphson iterations.
+//
+//   n      - size of the input in bytes; must be a multiple of sizeof(float).
+//   x      - input elements. The remainder path loads a full 4-float vector even when fewer remain.
+//   y      - output elements; exactly n bytes are written.
+//   params - not used by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048 = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+  // Coefficient of the degree-1 polynomial P1(t) = 1 + t * c1.
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+  // Selects the 11 low bits of n, which index the 2048-entry table.
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // Same algorithm as the main loop above, applied to one vector of 4 elements:
+    // compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // Remainder of 1-3 elements: same algorithm as above on a full vector (which loads past the
+    // last valid element of x), f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0; only the valid lanes are stored.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x16.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x16.c
new file mode 100644
index 0000000..e72ded7
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x16.c

@@ -0,0 +1,398 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x20.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x20.c
new file mode 100644
index 0000000..95bd49d
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x20.c

@@ -0,0 +1,425 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which cause rounding of the result to an integer, then subtracing
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x24.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x24.c
new file mode 100644
index 0000000..521d8e9
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x24.c

@@ -0,0 +1,452 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using NEON intrinsics with fused multiply-add.
+//
+// The exponential is evaluated with a 2048-entry table of 2**(k/2048) values, a degree-1 polynomial on the reduced
+// argument, and a reciprocal refined by two Newton-Raphson iterations.
+//
+// Parameters:
+//   n      - size of the input in BYTES; must be a multiple of sizeof(float).
+//   x      - pointer to the input elements.
+//   y      - pointer to the output elements.
+//   params - not referenced by this kernel.
+//
+// Elements are processed 24 at a time, then 4 at a time, then a final partial group of 1-3 elements.
+// Note that the partial-group path loads a full 4-element vector from x, so up to 3 floats past the end of the
+// input may be read (only the valid outputs are stored).
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  // Mask selecting the 11 low bits of n (the index into the 2048-entry table).
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  // Main loop: 24 elements (six 4-lane vectors) per iteration.
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veKLMN = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnKLMN), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the 11 low bits of n, as integer, as an index for table lookup of l := 2**(n / 2048 - e).
+    // The four 32-bit indices are viewed as two 64-bit lanes so that each scalar extraction yields two indices.
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+    const uint64x2_t vidxKLMN = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnKLMN), vindex_mask));
+
+    // Load the table entry for the low 32-bit index of each pair (duplicated into both lanes).
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+    const uint64_t vidxKL = vgetq_lane_u64(vidxKLMN, 0);
+    const uint64_t vidxMN = vgetq_lane_u64(vidxKLMN, 1);
+    float32x2_t vlKL = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxKL]);
+    float32x2_t vlMN = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxMN]);
+
+    // Overwrite lane 1 with the table entry for the high 32-bit index of each pair, then assemble 4-lane vectors.
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+    vlKL = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxKL >> 32)], vlKL, 1);
+    vlMN = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxMN >> 32)], vlMN, 1);
+    const float32x4_t vlKLMN = vcombine_f32(vlKL, vlMN);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlKLMN), veKLMN));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+    const float32x4_t vpKLMN = vmulq_f32(vtKLMN, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+    const float32x4_t vyKLMN = vfmaq_f32(vsKLMN, vsKLMN, vpKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+    const float32x4_t vdKLMN = vaddq_f32(vyKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    // Each iteration computes r := r + r * (1 - r * d).
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+    vrKLMN = vfmaq_f32(vrKLMN, vrKLMN, vfmsq_f32(vone, vrKLMN, vdKLMN));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+    vrKLMN = vfmaq_f32(vrKLMN, vrKLMN, vfmsq_f32(vone, vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The selection mask is derived from the sign bit of x by comparing its bit pattern, as a signed integer, with 0.
+    // This is the same test used by the 4-element and remainder paths below, so all paths classify inputs alike.
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+    const uint32x4_t vm89AB = vcltq_s32(vreinterpretq_s32_f32(vx89AB), vmovq_n_s32(0));
+    const uint32x4_t vmCDEF = vcltq_s32(vreinterpretq_s32_f32(vxCDEF), vmovq_n_s32(0));
+    const uint32x4_t vmGHIJ = vcltq_s32(vreinterpretq_s32_f32(vxGHIJ), vmovq_n_s32(0));
+    const uint32x4_t vmKLMN = vcltq_s32(vreinterpretq_s32_f32(vxKLMN), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  // Secondary loop: 4 elements (one vector) per iteration, same algorithm as above.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the 11 low bits of n, as integer, as an index for table lookup of l := 2**(n / 2048 - e).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Remainder: 1 to 3 elements. A full vector is loaded and processed; only the valid lanes are stored.
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the 11 low bits of n, as integer, as an index for table lookup of l := 2**(n / 2048 - e).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if bit 1 of the remaining count is set, then 1 element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x4.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x4.c
new file mode 100644
index 0000000..2af9ee5
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x4.c

@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Sigmoid micro-kernel for single-precision data (NEON with FMA) that handles 4 elements per loop iteration.
+//
+// exp(-|x|) is rebuilt from a 2048-entry table lookup plus a degree-1 polynomial, and the division is replaced by a
+// reciprocal estimate refined with two Newton-Raphson steps.
+//
+//   n      - number of bytes to process; must be a multiple of sizeof(float).
+//   x      - source elements. A trailing group of 1-3 elements is still fetched with a full 4-lane load.
+//   y      - destination elements.
+//   params - not referenced by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  // Magic bias: adding it rounds the sum to an integer that ends up in the low bits of the result.
+  const float32x4_t vbias = vmovq_n_f32(0x1.800000p23f);
+  // Largest z for which sigmoidf(-z), and likewise expf(-z), is still a normalized number.
+  const float32x4_t vcutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  // -2048 / log(2)
+  const float32x4_t vscale = vmovq_n_f32(-0x1.715476p11f);
+  // log(2) / 2048 split into a high and a low part for Cody-Waite range reduction.
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vunit = vmovq_n_f32(1.0f);
+  // Coefficient c1 of the polynomial P1(t) = 1 + t * c1.
+  const float32x4_t vslope = vmovq_n_f32(-0x1.FFFFFEp-1f);
+  // Selects the low 11 bits, i.e. the index into the 2048-entry table.
+  const int32x4_t vlow11 = vmovq_n_s32(INT32_C(0x7FF));
+
+  while (n >= 4 * sizeof(float)) {
+    const float32x4_t vin = vld1q_f32(x);
+    x += 4;
+
+    // Outline of the computation, with z := abs(x):
+    //   sigmoid(-z) = exp(-z) / (1 + exp(-z))   is the answer for lanes with x < 0,
+    //   1 - sigmoid(-z)                         is the answer for the remaining lanes.
+    const float32x4_t vabs = vabsq_f32(vin);
+
+    // k := round(-z * 2048 / log(2)), produced by a single FMA that also adds the magic bias. The rounding trick only
+    // holds while |z * 2048 / log(2)| <= 2**22 (|z| <= 0x1.62E43p+10 = 1419.5654296875); that is sufficient because
+    // inputs outside [-87.336544, 17.328678] underflow or saturate sigmoidf(x) and are patched up further down.
+    float32x4_t vk = vfmaq_f32(vbias, vabs, vscale);
+    const int32x4_t vk_bits = vreinterpretq_s32_f32(vk);
+
+    // The scale s := 2**(k / 2048) is split as 2**e * 2**(k / 2048 - e) with e := int(k / 2048).
+    // Bits 11:19 of k carry e; shifting left by 12 places them in bits 23:31, the floating-point exponent position.
+    const int32x4_t vexp = vshlq_n_s32(vbicq_s32(vk_bits, vlow11), 12);
+
+    // The low 11 bits of k index the table. Each 64-bit lane packs two 32-bit indices, so the four table entries are
+    // gathered with two duplicating loads followed by two single-lane loads.
+    const uint64x2_t vslots = vreinterpretq_u64_s32(vandq_s32(vk_bits, vlow11));
+    const uint64_t vslots01 = vgetq_lane_u64(vslots, 0);
+    const uint64_t vslots23 = vgetq_lane_u64(vslots, 1);
+    float32x2_t vtab01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vslots01]);
+    float32x2_t vtab23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vslots23]);
+    vtab01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vslots01 >> 32)], vtab01, 1);
+    vtab23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vslots23 >> 32)], vtab23, 1);
+    const float32x4_t vtab = vcombine_f32(vtab01, vtab23);
+
+    // Table entries lie in [1.0, 2.0), so adding e to their exponent field yields s directly.
+    const float32x4_t vpow2 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab), vexp));
+
+    // Remove the bias again to recover k as an ordinary floating-point number.
+    vk = vsubq_f32(vk, vbias);
+
+    // Reduced argument t := z + k * log(2) / 2048, accumulated in two steps (Cody-Waite) to limit rounding error.
+    float32x4_t vrem = vfmaq_f32(vabs, vk, vln2_hi);
+    vrem = vfmaq_f32(vrem, vk, vln2_lo);
+
+    // exp(-z) ~= s * (1 + t * c1) = s + s * (t * c1)
+    const float32x4_t vpoly = vmulq_f32(vrem, vslope);
+    const float32x4_t vexpz = vfmaq_f32(vpow2, vpow2, vpoly);
+
+    // Denominator 1 + exp(-z). It stays within (1, 2], so its reciprocal cannot overflow.
+    const float32x4_t vden = vaddq_f32(vexpz, vunit);
+
+    // Reciprocal estimate followed by two Newton-Raphson refinements: r <- r + r * (1 - r * d).
+    float32x4_t vrcp = vrecpeq_f32(vden);
+    const float32x4_t verr0 = vfmsq_f32(vunit, vrcp, vden);
+    vrcp = vfmaq_f32(vrcp, vrcp, verr0);
+    const float32x4_t verr1 = vfmsq_f32(vunit, vrcp, vden);
+    vrcp = vfmaq_f32(vrcp, vrcp, verr1);
+
+    // sigmoid(-z) = exp(-z) / (1 + exp(-z))
+    float32x4_t vlow = vmulq_f32(vexpz, vrcp);
+
+    // Clear lanes with |x| above the cutoff to +0.0f. The comparison is false for NaN, so NaN lanes pass through.
+    const uint32x4_t vflush = vcagtq_f32(vin, vcutoff);
+    vlow = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vlow), vflush));
+
+    // Lanes with the sign bit set keep sigmoid(-z); the others take 1 - sigmoid(-z).
+    const uint32x4_t vneg = vcltq_s32(vreinterpretq_s32_f32(vin), vmovq_n_s32(0));
+    const float32x4_t vmirror = vsubq_f32(vunit, vlow);
+    const float32x4_t vres = vbslq_f32(vneg, vlow, vmirror);
+
+    vst1q_f32(y, vres);
+    y += 4;
+
+    n -= 4 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1-3 elements remain. A whole vector is loaded and evaluated exactly as in the loop above; only the store is
+    // narrowed to the elements that are actually left.
+    const float32x4_t vin = vld1q_f32(x);
+
+    const float32x4_t vabs = vabsq_f32(vin);
+
+    // k := round(-z * 2048 / log(2)) via the magic bias.
+    float32x4_t vk = vfmaq_f32(vbias, vabs, vscale);
+    const int32x4_t vk_bits = vreinterpretq_s32_f32(vk);
+
+    // Exponent adjustment e, moved from bits 11:19 of k into bits 23:31.
+    const int32x4_t vexp = vshlq_n_s32(vbicq_s32(vk_bits, vlow11), 12);
+
+    // Table lookup driven by the low 11 bits of k.
+    const uint64x2_t vslots = vreinterpretq_u64_s32(vandq_s32(vk_bits, vlow11));
+    const uint64_t vslots01 = vgetq_lane_u64(vslots, 0);
+    const uint64_t vslots23 = vgetq_lane_u64(vslots, 1);
+    float32x2_t vtab01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vslots01]);
+    float32x2_t vtab23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vslots23]);
+    vtab01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vslots01 >> 32)], vtab01, 1);
+    vtab23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vslots23 >> 32)], vtab23, 1);
+    const float32x4_t vtab = vcombine_f32(vtab01, vtab23);
+
+    // s := table entry with e added to its exponent field.
+    const float32x4_t vpow2 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vtab), vexp));
+
+    // Recover k as a floating-point number.
+    vk = vsubq_f32(vk, vbias);
+
+    // t := z + k * log(2) / 2048 (Cody-Waite, two steps).
+    float32x4_t vrem = vfmaq_f32(vabs, vk, vln2_hi);
+    vrem = vfmaq_f32(vrem, vk, vln2_lo);
+
+    // exp(-z) ~= s + s * (t * c1)
+    const float32x4_t vpoly = vmulq_f32(vrem, vslope);
+    const float32x4_t vexpz = vfmaq_f32(vpow2, vpow2, vpoly);
+
+    // Reciprocal of 1 + exp(-z): estimate plus two Newton-Raphson refinements.
+    const float32x4_t vden = vaddq_f32(vexpz, vunit);
+    float32x4_t vrcp = vrecpeq_f32(vden);
+    const float32x4_t verr0 = vfmsq_f32(vunit, vrcp, vden);
+    vrcp = vfmaq_f32(vrcp, vrcp, verr0);
+    const float32x4_t verr1 = vfmsq_f32(vunit, vrcp, vden);
+    vrcp = vfmaq_f32(vrcp, vrcp, verr1);
+
+    // sigmoid(-z), cleared to +0.0f where |x| exceeds the cutoff (NaN lanes are left untouched).
+    float32x4_t vlow = vmulq_f32(vexpz, vrcp);
+    const uint32x4_t vflush = vcagtq_f32(vin, vcutoff);
+    vlow = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vlow), vflush));
+
+    // Select sigmoid(-z) for lanes with the sign bit set and 1 - sigmoid(-z) otherwise.
+    const uint32x4_t vneg = vcltq_s32(vreinterpretq_s32_f32(vin), vmovq_n_s32(0));
+    const float32x4_t vmirror = vsubq_f32(vunit, vlow);
+    const float32x4_t vres = vbslq_f32(vneg, vlow, vmirror);
+
+    // Bit 3 of n (2 floats) and bit 2 of n (1 float) decide how much of the vector is stored.
+    float32x2_t vhalf = vget_low_f32(vres);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vhalf);
+      y += 2;
+      vhalf = vget_high_f32(vres);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vhalf, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x8.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x8.c
new file mode 100644
index 0000000..cf30ace
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x8.c

@@ -0,0 +1,344 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using NEON with FMA.
+//
+// exp(-|x|) is reconstructed from a 2048-entry table lookup and a degree-1 polynomial, and the division is replaced
+// by a reciprocal estimate refined with two Newton-Raphson iterations. The main loop handles 8 elements per
+// iteration; it is followed by a 4-element loop and a tail for the last 1-3 elements.
+//
+//   n      - size of the input in bytes; must be a multiple of sizeof(float).
+//   x      - input elements. The 1-3 element tail still loads a full 4-element vector from x.
+//   y      - output elements.
+//   params - not referenced by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+
+    // Each 64-bit lane packs two 32-bit indices: the low half is loaded with a duplicating load, the high half is
+    // then inserted into lane 1.
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand is built with vmovq_n_f32 because vcltq_f32 compares two float32x4_t vectors.
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1-3 elements remain: a full vector is loaded and processed, but only the remaining elements are stored.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding large number is valid only within certain bounds
+    // (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x).
+    // We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer.
+    //    Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if that bit of n is set, then 1 more element if that bit is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x12.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x12.c
new file mode 100644
index 0000000..9b36feb
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x12.c

@@ -0,0 +1,371 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for an array of floats, processing 12 elements per main-loop iteration.
+// n is the array size in BYTES (asserted to be a multiple of sizeof(float)); params is not read by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized; also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations, r <- r * (2 - r * d) via VRECPS) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2 because z >= 0.0 and 0 < exp(-z) <= 1.0, so the reciprocal of d never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations, r <- r * (2 - r * d) via VRECPS) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2 because z >= 0.0 and 0 < exp(-z) <= 1.0, so the reciprocal of d never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1-3 elements remain. Note: a full 4-element vector is still loaded, i.e. up to 12 bytes past the end of x are
+    // read; only the valid lanes are stored to y below.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations, r <- r * (2 - r * d) via VRECPS) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2 because z >= 0.0 and 0 < exp(-z) <= 1.0, so the reciprocal of d never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x16.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x16.c
new file mode 100644
index 0000000..e544536
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x16.c

@@ -0,0 +1,398 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x20.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x20.c
new file mode 100644
index 0000000..37121ad
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x20.c

@@ -0,0 +1,425 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_s32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_s32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_s32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_s32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_s32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fix up the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from exp2_k_over_2048_table using the 6 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fecthed value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 bits of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x24.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x24.c
new file mode 100644
index 0000000..fdacea7
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x24.c

@@ -0,0 +1,452 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+// Computes y[i] = sigmoid(x[i]) for n / sizeof(float) single-precision elements (n is a size in bytes).
+// exp(-|x|) is rebuilt from a 2048-entry table of 2**(k/2048) and a degree-1 polynomial; the division
+// uses VRECPE followed by two VRECPS Newton-Raphson steps. The main loop handles 24 elements per
+// iteration, the second loop 4 elements, and the tail a partial vector of 1-3 elements.
+// The tail loads a full 4-float vector, so x must be readable past its last element
+// (NOTE(review): presumably guaranteed by caller-side padding - confirm). params is not used.
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e_x2048);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e_x2048);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e_x2048);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve89AB = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn89AB), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veCDEF = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnCDEF), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veGHIJ = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnGHIJ), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t veKLMN = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vnKLMN), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+    const uint64x2_t vidx89AB = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn89AB), vindex_mask));
+    const uint64x2_t vidxCDEF = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnCDEF), vindex_mask));
+    const uint64x2_t vidxGHIJ = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnGHIJ), vindex_mask));
+    const uint64x2_t vidxKLMN = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vnKLMN), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+    const uint64_t vidx89 = vgetq_lane_u64(vidx89AB, 0);
+    const uint64_t vidxAB = vgetq_lane_u64(vidx89AB, 1);
+    float32x2_t vl89 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx89]);
+    float32x2_t vlAB = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxAB]);
+    const uint64_t vidxCD = vgetq_lane_u64(vidxCDEF, 0);
+    const uint64_t vidxEF = vgetq_lane_u64(vidxCDEF, 1);
+    float32x2_t vlCD = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxCD]);
+    float32x2_t vlEF = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxEF]);
+    const uint64_t vidxGH = vgetq_lane_u64(vidxGHIJ, 0);
+    const uint64_t vidxIJ = vgetq_lane_u64(vidxGHIJ, 1);
+    float32x2_t vlGH = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxGH]);
+    float32x2_t vlIJ = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxIJ]);
+    const uint64_t vidxKL = vgetq_lane_u64(vidxKLMN, 0);
+    const uint64_t vidxMN = vgetq_lane_u64(vidxKLMN, 1);
+    float32x2_t vlKL = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxKL]);
+    float32x2_t vlMN = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidxMN]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+    vl89 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx89 >> 32)], vl89, 1);
+    vlAB = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxAB >> 32)], vlAB, 1);
+    const float32x4_t vl89AB = vcombine_f32(vl89, vlAB);
+    vlCD = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxCD >> 32)], vlCD, 1);
+    vlEF = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxEF >> 32)], vlEF, 1);
+    const float32x4_t vlCDEF = vcombine_f32(vlCD, vlEF);
+    vlGH = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxGH >> 32)], vlGH, 1);
+    vlIJ = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxIJ >> 32)], vlIJ, 1);
+    const float32x4_t vlGHIJ = vcombine_f32(vlGH, vlIJ);
+    vlKL = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxKL >> 32)], vlKL, 1);
+    vlMN = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidxMN >> 32)], vlMN, 1);
+    const float32x4_t vlKLMN = vcombine_f32(vlKL, vlMN);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl89AB), ve89AB));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlCDEF), veCDEF));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlGHIJ), veGHIJ));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vlKLMN), veKLMN));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_o2048_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_o2048_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_o2048_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_o2048_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_o2048_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_o2048_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+    const float32x4_t vp89AB = vmulq_f32(vt89AB, vc1);
+    const float32x4_t vpCDEF = vmulq_f32(vtCDEF, vc1);
+    const float32x4_t vpGHIJ = vmulq_f32(vtGHIJ, vc1);
+    const float32x4_t vpKLMN = vmulq_f32(vtKLMN, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+    const float32x4_t vy89AB = vfmaq_f32(vs89AB, vs89AB, vp89AB);
+    const float32x4_t vyCDEF = vfmaq_f32(vsCDEF, vsCDEF, vpCDEF);
+    const float32x4_t vyGHIJ = vfmaq_f32(vsGHIJ, vsGHIJ, vpGHIJ);
+    const float32x4_t vyKLMN = vfmaq_f32(vsKLMN, vsKLMN, vpKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+    const float32x4_t vd89AB = vaddq_f32(vy89AB, vone);
+    const float32x4_t vdCDEF = vaddq_f32(vyCDEF, vone);
+    const float32x4_t vdGHIJ = vaddq_f32(vyGHIJ, vone);
+    const float32x4_t vdKLMN = vaddq_f32(vyKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(vy89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(vyCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(vyGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(vyKLMN, vrKLMN);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); the sign bit of x decides (integer compare).
+    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
+    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
+    const uint32x4_t vm89AB = vcltq_s32(vreinterpretq_s32_f32(vx89AB), vmovq_n_s32(0));
+    const uint32x4_t vmCDEF = vcltq_s32(vreinterpretq_s32_f32(vxCDEF), vmovq_n_s32(0));
+    const uint32x4_t vmGHIJ = vcltq_s32(vreinterpretq_s32_f32(vxGHIJ), vmovq_n_s32(0));
+    const uint32x4_t vmKLMN = vcltq_s32(vreinterpretq_s32_f32(vxKLMN), vmovq_n_s32(0));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); the sign bit of x decides (integer compare).
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // full-vector load; lanes past the remaining elements are never stored
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note
+    //    that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:10 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); the sign bit of x decides (integer compare).
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // 2 or 3 elements remain: store the low pair, then continue with the high pair
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // an odd element remains: store a single lane
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x4.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x4.c
new file mode 100644
index 0000000..6e4d45a
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x4.c

@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits of n (bits 0:10), as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits of n (bits 0:10), as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x8.c b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x8.c
new file mode 100644
index 0000000..e952674
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x8.c

@@ -0,0 +1,344 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-lut2048-p1.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e_x2048);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve0123 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn0123), vmovq_n_s32(INT32_C(0x7FF))), 12);
+    const int32x4_t ve4567 = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn4567), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits of n (bits 0:10), as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx0123 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn0123), vindex_mask));
+    const uint64x2_t vidx4567 = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn4567), vindex_mask));
+
+    const uint64_t vidx01 = vgetq_lane_u64(vidx0123, 0);
+    const uint64_t vidx23 = vgetq_lane_u64(vidx0123, 1);
+    float32x2_t vl01 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx23]);
+    const uint64_t vidx45 = vgetq_lane_u64(vidx4567, 0);
+    const uint64_t vidx67 = vgetq_lane_u64(vidx4567, 1);
+    float32x2_t vl45 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx45]);
+    float32x2_t vl67 = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx67]);
+
+    vl01 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl0123 = vcombine_f32(vl01, vl23);
+    vl45 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx45 >> 32)], vl45, 1);
+    vl67 = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx67 >> 32)], vl67, 1);
+    const float32x4_t vl4567 = vcombine_f32(vl45, vl67);
+
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl0123), ve0123));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl4567), ve4567));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_o2048_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_o2048_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_o2048_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp0123 = vmulq_f32(vt0123, vc1);
+    const float32x4_t vp4567 = vmulq_f32(vt4567, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy0123 = vfmaq_f32(vs0123, vs0123, vp0123);
+    const float32x4_t vy4567 = vfmaq_f32(vs4567, vs4567, vp4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd0123 = vaddq_f32(vy0123, vone);
+    const float32x4_t vd4567 = vaddq_f32(vy4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(vy0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(vy4567, vr4567);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits of n (bits 0:10), as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from exp2_k_over_2048_table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust the fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use the low 11 bits of n (bits 0:10), as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vfmaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vfmaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-div-x12.c b/src/f32-sigmoid/gen/neonfma-p5-div-x12.c
new file mode 100644
index 0000000..cd43f82
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-div-x12.c

@@ -0,0 +1,275 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes sigmoid(x[i]) = 1 / (1 + exp(-x[i])) element-wise over an array of floats.
+//
+// Parameters:
+//   n      - size of the input in bytes; must be a multiple of sizeof(float).
+//   x      - input array.
+//   y      - output array; receives n / sizeof(float) results.
+//   params - not referenced by this kernel.
+//
+// The main loop handles 12 elements (three 4-lane vectors) per iteration, a secondary loop handles one vector at a
+// time, and a final block handles the last 1-3 elements. All three paths perform the same computation.
+void xnn_f32_sigmoid_ukernel__neonfma_p5_div_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // log(2) split into a high and a low part for Cody-Waite range reduction.
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficients of the degree-5 polynomial approximation of exp(-t).
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(ve0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(ve4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(ve89AB, vd89AB);
+
+    // Where |x| exceeds the denormal cutoff, clear every bit of sigmoid(-z), i.e. replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  // Process the remaining full vectors one at a time, using the same algorithm as the main loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // Where |x| exceeds the denormal cutoff, clear every bit of sigmoid(-z), i.e. replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float vector: vcltq_f32 takes two float32x4_t arguments.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Process the final 1-3 elements.
+  if XNN_UNLIKELY(n != 0) {
+    // A full 4-lane vector is loaded even though fewer than 4 elements remain; only the valid lanes are stored below.
+    // NOTE(review): this presumably relies on callers permitting reads past the end of x - confirm.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // Where |x| exceeds the denormal cutoff, clear every bit of sigmoid(-z), i.e. replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if bit 1 of the remaining count is set, then 1 element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-div-x16.c b/src/f32-sigmoid/gen/neonfma-p5-div-x16.c
new file mode 100644
index 0000000..c524a47
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-div-x16.c

@@ -0,0 +1,294 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes sigmoid(x[i]) = 1 / (1 + exp(-x[i])) element-wise over an array of floats.
+//
+// Parameters:
+//   n      - size of the input in bytes; must be a multiple of sizeof(float).
+//   x      - input array.
+//   y      - output array; receives n / sizeof(float) results.
+//   params - not referenced by this kernel.
+//
+// The main loop handles 16 elements (four 4-lane vectors) per iteration, a secondary loop handles one vector at a
+// time, and a final block handles the last 1-3 elements. All three paths perform the same computation.
+void xnn_f32_sigmoid_ukernel__neonfma_p5_div_x16(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // log(2) split into a high and a low part for Cody-Waite range reduction.
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  // Coefficients of the degree-5 polynomial approximation of exp(-t).
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(ve0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(ve4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(ve89AB, vd89AB);
+    float32x4_t vfCDEF = vdivq_f32(veCDEF, vdCDEF);
+
+    // Where |x| exceeds the denormal cutoff, clear every bit of sigmoid(-z), i.e. replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  // Process the remaining full vectors one at a time, using the same algorithm as the main loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // Where |x| exceeds the denormal cutoff, clear every bit of sigmoid(-z), i.e. replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float vector: vcltq_f32 takes two float32x4_t arguments.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  // Process the final 1-3 elements.
+  if XNN_UNLIKELY(n != 0) {
+    // A full 4-lane vector is loaded even though fewer than 4 elements remain; only the valid lanes are stored below.
+    // NOTE(review): this presumably relies on callers permitting reads past the end of x - confirm.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // Where |x| exceeds the denormal cutoff, clear every bit of sigmoid(-z), i.e. replace it with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if bit 1 of the remaining count is set, then 1 element if bit 0 is set.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-div-x20.c b/src/f32-sigmoid/gen/neonfma-p5-div-x20.c
new file mode 100644
index 0000000..983a4d4
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-div-x20.c

@@ -0,0 +1,313 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_div_x20(
+    size_t n,            // size of the input/output in bytes; must be a multiple of sizeof(float)
+    const float* x,      // input elements
+    float* y,            // output: sigmoid(x[i]) is written for every input element
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(ve0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(ve4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(ve89AB, vd89AB);
+    float32x4_t vfCDEF = vdivq_f32(veCDEF, vdCDEF);
+    float32x4_t vfGHIJ = vdivq_f32(veGHIJ, vdGHIJ);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // full 4-lane load although only 1-3 elements remain (reads past them)
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-div-x24.c b/src/f32-sigmoid/gen/neonfma-p5-div-x24.c
new file mode 100644
index 0000000..89b5d06
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-div-x24.c

@@ -0,0 +1,332 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_div_x24(
+    size_t n,            // size of the input/output in bytes; must be a multiple of sizeof(float)
+    const float* x,      // input elements
+    float* y,            // output: sigmoid(x[i]) is written for every input element
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+    float32x4_t vpKLMN = vfmaq_f32(vc4, vc5, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc3, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc2, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc1, vpKLMN, vtKLMN);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+    vtKLMN = vmulq_f32(vtKLMN, vsKLMN);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+    float32x4_t veKLMN = vfmaq_f32(vsKLMN, vpKLMN, vtKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+    float32x4_t vdKLMN = vaddq_f32(veKLMN, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(ve0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(ve4567, vd4567);
+    float32x4_t vf89AB = vdivq_f32(ve89AB, vd89AB);
+    float32x4_t vfCDEF = vdivq_f32(veCDEF, vdCDEF);
+    float32x4_t vfGHIJ = vdivq_f32(veGHIJ, vdGHIJ);
+    float32x4_t vfKLMN = vdivq_f32(veKLMN, vdKLMN);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // full 4-lane load although only 1-3 elements remain (reads past them)
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-div-x4.c b/src/f32-sigmoid/gen/neonfma-p5-div-x4.c
new file mode 100644
index 0000000..b197fc8
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-div-x4.c

@@ -0,0 +1,170 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_div_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-div-x8.c b/src/f32-sigmoid/gen/neonfma-p5-div-x8.c
new file mode 100644
index 0000000..3fd3629
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-div-x8.c

@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_div_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vdivq_f32(ve0123, vd0123);
+    float32x4_t vf4567 = vdivq_f32(ve4567, vd4567);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vdivq_f32(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x12.c b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x12.c
new file mode 100644
index 0000000..f4a11c6
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x12.c

@@ -0,0 +1,308 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x16.c b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x16.c
new file mode 100644
index 0000000..dfb9151
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x16.c

@@ -0,0 +1,330 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x16(  // Computes y[i] := sigmoid(x[i]) elementwise.
+    size_t n,  // Input size in bytes; must be a multiple of sizeof(float).
+    const float* x,  // Input elements.
+    float* y,  // Output elements.
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, and then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, and then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // Reads 4 floats though only 1-3 remain; presumably x is padded (confirm).
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, and then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x20.c b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x20.c
new file mode 100644
index 0000000..4e85372
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x20.c

@@ -0,0 +1,352 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x20(  // Computes y[i] := sigmoid(x[i]) elementwise.
+    size_t n,  // Input size in bytes; must be a multiple of sizeof(float).
+    const float* x,  // Input elements.
+    float* y,  // Output elements.
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, and then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, and then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // Reads 4 floats though only 1-3 remain; presumably x is padded (confirm).
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, and then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x24.c b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x24.c
new file mode 100644
index 0000000..baf08fa
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x24.c

@@ -0,0 +1,374 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);  // n is the input size in bytes: a whole number of floats; params is not used.
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+    float32x4_t vpKLMN = vfmaq_f32(vc4, vc5, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc3, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc2, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc1, vpKLMN, vtKLMN);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+    vtKLMN = vmulq_f32(vtKLMN, vsKLMN);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+    float32x4_t veKLMN = vfmaq_f32(vsKLMN, vpKLMN, vtKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+    float32x4_t vdKLMN = vaddq_f32(veKLMN, vone);
+
+    // Use Newton-Raphson method (1 VRECPS iteration + 1 FMA iteration) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+    vrKLMN = vfmaq_f32(vrKLMN, vrKLMN, vfmsq_f32(vone, vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (1 VRECPS iteration + 1 FMA iteration) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // Full 4-lane load for the 1-3 leftover floats; only those lanes are stored.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (1 VRECPS iteration + 1 FMA iteration) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x4.c b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x4.c
new file mode 100644
index 0000000..26d6c2c
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x4.c

@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);  // n is the input size in bytes: a whole number of floats; params is not used.
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (1 VRECPS iteration + 1 FMA iteration) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // Full 4-lane load for the 1-3 leftover floats; only those lanes are stored.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (1 VRECPS iteration + 1 FMA iteration) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x8.c b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x8.c
new file mode 100644
index 0000000..69d05dc
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x8.c

@@ -0,0 +1,286 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x8(
+    size_t n,            // number of bytes of float elements to process; must be a multiple of sizeof(float)
+    const float* x,      // input elements
+    float* y,            // output elements
+    const void* params)  // not referenced by this kernel
+{
+  // Writes y[i] = sigmoid(x[i]) for n / sizeof(float) elements: 8 per main pass, then 4 at a time, then a 1-3 tail.
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which both sigmoidf(-z) and expf(-z) are normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+
+    // Use Newton-Raphson method (2 iterations: first via VRECPS, second via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all of its bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations: first via VRECPS, second via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all of its bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // loads a full 4-float vector although only 1-3 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations: first via VRECPS, second via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all of its bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // 2 or 3 elements remain: store the low pair, continue with the high pair
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // odd element count: store one more lane
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x12.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x12.c
new file mode 100644
index 0000000..f6d0143
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x12.c

@@ -0,0 +1,308 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x12(
+    size_t n,            // number of bytes of float elements to process; must be a multiple of sizeof(float)
+    const float* x,      // input elements
+    float* y,            // output elements
+    const void* params)  // not referenced by this kernel
+{
+  // Writes y[i] = sigmoid(x[i]) for n / sizeof(float) elements: 12 per main pass, then 4 at a time, then a 1-3 tail.
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which both sigmoidf(-z) and expf(-z) are normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations, both via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all of its bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations, both via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all of its bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // loads a full 4-float vector although only 1-3 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations, both via FMA) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f by clearing all of its bits.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // 2 or 3 elements remain: store the low pair, continue with the high pair
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // odd element count: store one more lane
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
index 0c8ac1b..9782f1f 100644
--- a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c

@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-sigmoid/neonfma-p5-nr2fma.c.in
+//   Template: src/f32-sigmoid/neon-p5.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
@@ -24,11 +24,9 @@
   assert(n % sizeof(float) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -115,7 +113,7 @@
     vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
     vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
 
-    // Reconstruct the exp(z) value:
+    // Reconstruct the exp(-z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
@@ -129,14 +127,14 @@
     float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
     float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
 
-    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
     float32x4_t vd0123 = vaddq_f32(ve0123, vone);
     float32x4_t vd4567 = vaddq_f32(ve4567, vone);
     float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
     float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
 
     // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
-    // Note: 1 < d <= 2, because z <= 0.0 and 0 < exp(z) <= 1.0.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
     // Thus the reciprocal of the denominator never overflows.
     float32x4_t vr0123 = vrecpeq_f32(vd0123);
     float32x4_t vr4567 = vrecpeq_f32(vd4567);
@@ -153,44 +151,37 @@
     vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
     vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
 
-    // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
     float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
     float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
     float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
 
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
-    const uint32x4_t vm4567 = vcltq_s32(vreinterpretq_s32_f32(vx4567), vmovq_n_s32(0));
-    const uint32x4_t vm89AB = vcltq_s32(vreinterpretq_s32_f32(vx89AB), vmovq_n_s32(0));
-    const uint32x4_t vmCDEF = vcltq_s32(vreinterpretq_s32_f32(vxCDEF), vmovq_n_s32(0));
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
 
     vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
     vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
     vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
     vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vbslq_f32(vcgtq_f32(vx0123, vone_cutoff), vone, vf0123);
-    vf4567 = vbslq_f32(vcgtq_f32(vx4567, vone_cutoff), vone, vf4567);
-    vf89AB = vbslq_f32(vcgtq_f32(vx89AB, vone_cutoff), vone, vf89AB);
-    vfCDEF = vbslq_f32(vcgtq_f32(vxCDEF, vone_cutoff), vone, vfCDEF);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff)));
-    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcltq_f32(vx4567, vdenorm_cutoff)));
-    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcltq_f32(vx89AB, vdenorm_cutoff)));
-    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcltq_f32(vxCDEF, vdenorm_cutoff)));
-
     vst1q_f32(y, vf0123); y += 4;
     vst1q_f32(y, vf4567); y += 4;
     vst1q_f32(y, vf89AB); y += 4;
     vst1q_f32(y, vfCDEF); y += 4;
   }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx = vld1q_f32(x); x += 4;
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -199,7 +190,7 @@
     //
     // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
     // then replace result with 1 - f[z] if x <= 0.
-    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz = vabsq_f32(vx);
 
     // Compute reduced argument n := round(-z / log(2)).
     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
@@ -207,62 +198,60 @@
     // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
     // inputs x outside of [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x)
     // anyway. We fixup the result for such inputs at the very end of the algorithm.
-    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
-    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
 
     // Subtract the large number back to get final n := round(-z / log(2)).
-    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn = vsubq_f32(vn, vmagic_bias);
 
     // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
-    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
 
     // Compute degree-5 polynomial approxiatmion for exp(-t) on [-log(2)/2, log(2)/2].
-    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
-    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
 
-    // Reconstruct the exp(z) value:
+    // Reconstruct the exp(-z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = vmulq_f32(vt0123, vs0123);
-    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
 
-    // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
 
     // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
-    // Note: 1 < d <= 2, because z <= 0.0 and 0 < exp(z) <= 1.0.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
     // Thus the reciprocal of the denominator never overflows.
-    float32x4_t vr0123 = vrecpeq_f32(vd0123);
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    float32x4_t vr = vrecpeq_f32(vd);
 
-    // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
 
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
-    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vbslq_f32(vcgtq_f32(vx0123, vone_cutoff), vone, vf0123);
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff)));
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
 
-    vst1q_f32(y, vf0123); y += 4;
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_s32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);
+    const float32x4_t vx = vld1q_f32(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -271,7 +260,7 @@
     //
     // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
     // then replace result with 1 - f[z] if x <= 0.
-    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz = vabsq_f32(vx);
 
     // Compute reduced argument n := round(-z / log(2)).
     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
@@ -279,65 +268,63 @@
     // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
     // inputs x outside of [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x)
     // anyway. We fixup the result for such inputs at the very end of the algorithm.
-    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
-    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
 
     // Subtract the large number back to get final n := round(-z / log(2)).
-    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn = vsubq_f32(vn, vmagic_bias);
 
     // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
-    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
 
     // Compute degree-5 polynomial approxiatmion for exp(-t) on [-log(2)/2, log(2)/2].
-    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
-    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
 
-    // Reconstruct the exp(z) value:
+    // Reconstruct the exp(-z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = vmulq_f32(vt0123, vs0123);
-    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
 
-    // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
 
     // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
-    // Note: 1 < d <= 2, because z <= 0.0 and 0 < exp(z) <= 1.0.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
     // Thus the reciprocal of the denominator never overflows.
-    float32x4_t vr0123 = vrecpeq_f32(vd0123);
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    float32x4_t vr = vrecpeq_f32(vd);
 
-    // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
 
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
-    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vbslq_f32(vcgtq_f32(vx0123, vone_cutoff), vone, vf0123);
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff)));
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
 
-    float32x2_t vf01 = vget_low_f32(vf0123);
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
     if (n & (2 * sizeof(float))) {
-      vst1_f32(y, vf01); y += 2;
-      vf01 = vget_high_f32(vf0123);
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
     }
     if (n & (1 * sizeof(float))) {
-      vst1_lane_f32(y, vf01, 0);
+      vst1_lane_f32(y, vf_lo, 0);
     }
   }
 }

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x20.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x20.c
new file mode 100644
index 0000000..40e5cb7
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x20.c

@@ -0,0 +1,352 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE(review): loads 4 floats although fewer remain; presumably callers pad x - confirm.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store the remaining 1-3 results: first a pair (if present), then a single lane.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x24.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x24.c
new file mode 100644
index 0000000..175b3f8
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x24.c

@@ -0,0 +1,374 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x24(
+    size_t n,            // Input size in BYTES (not elements); must be a multiple of sizeof(float).
+    const float* x,      // Input values.
+    float* y,            // Output values: y[i] := sigmoid(x[i]).
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+    float32x4_t vpKLMN = vfmaq_f32(vc4, vc5, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc3, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc2, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc1, vpKLMN, vtKLMN);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+    vtKLMN = vmulq_f32(vtKLMN, vsKLMN);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+    float32x4_t veKLMN = vfmaq_f32(vsKLMN, vpKLMN, vtKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+    float32x4_t vdKLMN = vaddq_f32(veKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+    vrKLMN = vfmaq_f32(vrKLMN, vrKLMN, vfmsq_f32(vone, vrKLMN, vdKLMN));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+    vr89AB = vfmaq_f32(vr89AB, vr89AB, vfmsq_f32(vone, vr89AB, vd89AB));
+    vrCDEF = vfmaq_f32(vrCDEF, vrCDEF, vfmsq_f32(vone, vrCDEF, vdCDEF));
+    vrGHIJ = vfmaq_f32(vrGHIJ, vrGHIJ, vfmsq_f32(vone, vrGHIJ, vdGHIJ));
+    vrKLMN = vfmaq_f32(vrKLMN, vrKLMN, vfmsq_f32(vone, vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE(review): loads 4 floats though only 1-3 remain; confirm x is padded.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x4.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x4.c
new file mode 100644
index 0000000..f4a8d4c
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x4.c

@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x4(
+    size_t n,            // Input size in BYTES (not elements); must be a multiple of sizeof(float).
+    const float* x,      // Input values.
+    float* y,            // Output values: y[i] := sigmoid(x[i]).
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE(review): loads 4 floats though only 1-3 remain; confirm x is padded.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x8.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x8.c
new file mode 100644
index 0000000..f7802e1
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x8.c

@@ -0,0 +1,286 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using NEON with FMA.
+// - n is the size of the input in bytes and must be a multiple of sizeof(float).
+// - exp(-|x|) is approximated with a degree-5 polynomial after range reduction, and the division is replaced
+//   by a reciprocal estimate refined with two FMA-based Newton-Raphson iterations.
+// - The main loop handles 8 elements per iteration, followed by a 4-element loop and a 1-3 element remainder.
+// - The remainder path loads a full 4-element vector from x but stores only the remaining elements to y.
+// - params is not read by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by -log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: r := r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+
+    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
+    vr4567 = vfmaq_f32(vr4567, vr4567, vfmsq_f32(vone, vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by -log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: r := r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float vector: vcltq_f32 compares two float32x4_t values.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1 to 3 elements remain: a full vector is loaded and processed, but only the remaining lanes are stored.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by -log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: r := r + r * (1 - r * d).
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if the 2-element bit of n is set, then switch to the high half for a possible last element.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x12.c b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x12.c
new file mode 100644
index 0000000..bf0dc12
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x12.c

@@ -0,0 +1,308 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+// Computes y[i] = sigmoid(x[i]) for single-precision inputs using NEON with FMA.
+// - n is the size of the input in bytes and must be a multiple of sizeof(float).
+// - exp(-|x|) is approximated with a degree-5 polynomial after range reduction, and the division is replaced
+//   by a reciprocal estimate refined with two Newton-Raphson iterations built on vrecpsq_f32.
+// - The main loop handles 12 elements per iteration, followed by a 4-element loop and a 1-3 element remainder.
+// - The remainder path loads a full 4-element vector from x but stores only the remaining elements to y.
+// - params is not read by this kernel.
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by -log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: r := r * (2 - r * d),
+    // where vrecpsq_f32(r, d) supplies the (2 - r * d) factor.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by -log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: r := r * (2 - r * d),
+    // where vrecpsq_f32(r, d) supplies the (2 - r * d) factor.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    // The zero operand must be a float vector: vcltq_f32 compares two float32x4_t values.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // 1 to 3 elements remain: a full vector is loaded and processed, but only the remaining lanes are stored.
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by -log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator: r := r * (2 - r * d),
+    // where vrecpsq_f32(r, d) supplies the (2 - r * d) factor.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store 2 elements if the 2-element bit of n is set, then switch to the high half for a possible last element.
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x16.c b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x16.c
new file mode 100644
index 0000000..6bf74d9
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x16.c

@@ -0,0 +1,330 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x16(
+    size_t n,            // Size of the input in bytes; must be a multiple of sizeof(float).
+    const float* x,      // Input elements.
+    float* y,            // Output: sigmoid of each input element.
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's
+    // ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's
+    // ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE: loads a full 4-float vector although fewer than 4 elements remain.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's
+    // ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // At least two elements remain: store lanes 0-1, then continue with lanes 2-3.
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // One more element remains: store a single lane.
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x20.c b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x20.c
new file mode 100644
index 0000000..69b9211
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x20.c

@@ -0,0 +1,352 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x20(
+    size_t n,            // Size of the input in bytes; must be a multiple of sizeof(float).
+    const float* x,      // Input elements.
+    float* y,            // Output: sigmoid of each input element.
+    const void* params)  // Not referenced by this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's
+    // ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's
+    // ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE: loads a full 4-float vector although fewer than 4 elements remain.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single FMA
+    // instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's
+    // ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x) anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {  // At least two elements remain: store lanes 0-1, then continue with lanes 2-3.
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {  // One more element remains: store a single lane.
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x24.c b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x24.c
new file mode 100644
index 0000000..8af8a9b
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x24.c

@@ -0,0 +1,374 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x24(
+    size_t n,  // size of x and y in bytes; asserted to be a multiple of sizeof(float)
+    const float* x,  // input elements
+    float* y,  // output elements: y[i] = sigmoid(x[i])
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+    const float32x4_t vx89AB = vld1q_f32(x); x += 4;
+    const float32x4_t vxCDEF = vld1q_f32(x); x += 4;
+    const float32x4_t vxGHIJ = vld1q_f32(x); x += 4;
+    const float32x4_t vxKLMN = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+    const float32x4_t vz89AB = vabsq_f32(vx89AB);
+    const float32x4_t vzCDEF = vabsq_f32(vxCDEF);
+    const float32x4_t vzGHIJ = vabsq_f32(vxGHIJ);
+    const float32x4_t vzKLMN = vabsq_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+    float32x4_t vn89AB = vfmaq_f32(vmagic_bias, vz89AB, vminus_log2e);
+    float32x4_t vnCDEF = vfmaq_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    float32x4_t vnGHIJ = vfmaq_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+    float32x4_t vnKLMN = vfmaq_f32(vmagic_bias, vzKLMN, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+    const float32x4_t vs89AB = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn89AB), 23));
+    const float32x4_t vsCDEF = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnCDEF), 23));
+    const float32x4_t vsGHIJ = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnGHIJ), 23));
+    const float32x4_t vsKLMN = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+    vn89AB = vsubq_f32(vn89AB, vmagic_bias);
+    vnCDEF = vsubq_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = vsubq_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = vsubq_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+    float32x4_t vt89AB = vfmaq_f32(vz89AB, vn89AB, vln2_hi);
+    float32x4_t vtCDEF = vfmaq_f32(vzCDEF, vnCDEF, vln2_hi);
+    float32x4_t vtGHIJ = vfmaq_f32(vzGHIJ, vnGHIJ, vln2_hi);
+    float32x4_t vtKLMN = vfmaq_f32(vzKLMN, vnKLMN, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = vfmaq_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = vfmaq_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = vfmaq_f32(vtGHIJ, vnGHIJ, vln2_lo);
+    vtKLMN = vfmaq_f32(vtKLMN, vnKLMN, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+    float32x4_t vp89AB = vfmaq_f32(vc4, vc5, vt89AB);
+    float32x4_t vpCDEF = vfmaq_f32(vc4, vc5, vtCDEF);
+    float32x4_t vpGHIJ = vfmaq_f32(vc4, vc5, vtGHIJ);
+    float32x4_t vpKLMN = vfmaq_f32(vc4, vc5, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc3, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc3, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc2, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc2, vpKLMN, vtKLMN);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+    vp89AB = vfmaq_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = vfmaq_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = vfmaq_f32(vc1, vpGHIJ, vtGHIJ);
+    vpKLMN = vfmaq_f32(vc1, vpKLMN, vtKLMN);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+    vt89AB = vmulq_f32(vt89AB, vs89AB);
+    vtCDEF = vmulq_f32(vtCDEF, vsCDEF);
+    vtGHIJ = vmulq_f32(vtGHIJ, vsGHIJ);
+    vtKLMN = vmulq_f32(vtKLMN, vsKLMN);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+    float32x4_t ve89AB = vfmaq_f32(vs89AB, vp89AB, vt89AB);
+    float32x4_t veCDEF = vfmaq_f32(vsCDEF, vpCDEF, vtCDEF);
+    float32x4_t veGHIJ = vfmaq_f32(vsGHIJ, vpGHIJ, vtGHIJ);
+    float32x4_t veKLMN = vfmaq_f32(vsKLMN, vpKLMN, vtKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+    float32x4_t vd89AB = vaddq_f32(ve89AB, vone);
+    float32x4_t vdCDEF = vaddq_f32(veCDEF, vone);
+    float32x4_t vdGHIJ = vaddq_f32(veGHIJ, vone);
+    float32x4_t vdKLMN = vaddq_f32(veKLMN, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+    float32x4_t vr89AB = vrecpeq_f32(vd89AB);
+    float32x4_t vrCDEF = vrecpeq_f32(vdCDEF);
+    float32x4_t vrGHIJ = vrecpeq_f32(vdGHIJ);
+    float32x4_t vrKLMN = vrecpeq_f32(vdKLMN);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+    vr89AB = vmulq_f32(vr89AB, vrecpsq_f32(vr89AB, vd89AB));
+    vrCDEF = vmulq_f32(vrCDEF, vrecpsq_f32(vrCDEF, vdCDEF));
+    vrGHIJ = vmulq_f32(vrGHIJ, vrecpsq_f32(vrGHIJ, vdGHIJ));
+    vrKLMN = vmulq_f32(vrKLMN, vrecpsq_f32(vrKLMN, vdKLMN));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+    float32x4_t vf89AB = vmulq_f32(ve89AB, vr89AB);
+    float32x4_t vfCDEF = vmulq_f32(veCDEF, vrCDEF);
+    float32x4_t vfGHIJ = vmulq_f32(veGHIJ, vrGHIJ);
+    float32x4_t vfKLMN = vmulq_f32(veKLMN, vrKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+    vf89AB = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf89AB), vcagtq_f32(vx89AB, vdenorm_cutoff)));
+    vfCDEF = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfCDEF), vcagtq_f32(vxCDEF, vdenorm_cutoff)));
+    vfGHIJ = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfGHIJ), vcagtq_f32(vxGHIJ, vdenorm_cutoff)));
+    vfKLMN = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vfKLMN), vcagtq_f32(vxKLMN, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+    const uint32x4_t vm89AB = vcltq_f32(vx89AB, vmovq_n_f32(0.0f));
+    const uint32x4_t vmCDEF = vcltq_f32(vxCDEF, vmovq_n_f32(0.0f));
+    const uint32x4_t vmGHIJ = vcltq_f32(vxGHIJ, vmovq_n_f32(0.0f));
+    const uint32x4_t vmKLMN = vcltq_f32(vxKLMN, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+    vf89AB = vbslq_f32(vm89AB, vf89AB, vsubq_f32(vone, vf89AB));
+    vfCDEF = vbslq_f32(vmCDEF, vfCDEF, vsubq_f32(vone, vfCDEF));
+    vfGHIJ = vbslq_f32(vmGHIJ, vfGHIJ, vsubq_f32(vone, vfGHIJ));
+    vfKLMN = vbslq_f32(vmKLMN, vfKLMN, vsubq_f32(vone, vfKLMN));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+    vst1q_f32(y, vf89AB); y += 4;
+    vst1q_f32(y, vfCDEF); y += 4;
+    vst1q_f32(y, vfGHIJ); y += 4;
+    vst1q_f32(y, vfKLMN); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE: full 4-lane load although only 1-3 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x4.c b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x4.c
new file mode 100644
index 0000000..b19b444
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x4.c

@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x4(
+    size_t n,  // size of x and y in bytes; asserted to be a multiple of sizeof(float)
+    const float* x,  // input elements
+    float* y,  // output elements: y[i] = sigmoid(x[i])
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE: full 4-lane load although only 1-3 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x8.c b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x8.c
new file mode 100644
index 0000000..5033b9d
--- /dev/null
+++ b/src/f32-sigmoid/gen/neonfma-p5-nr2recps-x8.c

@@ -0,0 +1,286 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/neon-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x8(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+  // Computes y[i] = sigmoid(x[i]) for n / sizeof(float) elements; n is a size in bytes and params is not read.
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
+    const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz0123 = vabsq_f32(vx0123);
+    const float32x4_t vz4567 = vabsq_f32(vx4567);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is fused with the multiplication by -log2(e) into one FMA.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
+    float32x4_t vn4567 = vfmaq_f32(vmagic_bias, vz4567, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
+    const float32x4_t vs4567 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn0123 = vsubq_f32(vn0123, vmagic_bias);
+    vn4567 = vsubq_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
+    float32x4_t vt4567 = vfmaq_f32(vz4567, vn4567, vln2_hi);
+
+    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = vfmaq_f32(vt4567, vn4567, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
+    float32x4_t vp4567 = vfmaq_f32(vc4, vc5, vt4567);
+
+    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc3, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc2, vp4567, vt4567);
+
+    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
+    vp4567 = vfmaq_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = vmulq_f32(vt0123, vs0123);
+    vt4567 = vmulq_f32(vt4567, vs4567);
+
+    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
+    float32x4_t ve4567 = vfmaq_f32(vs4567, vp4567, vt4567);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
+    float32x4_t vd4567 = vaddq_f32(ve4567, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr0123 = vrecpeq_f32(vd0123);
+    float32x4_t vr4567 = vrecpeq_f32(vd4567);
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    vr0123 = vmulq_f32(vr0123, vrecpsq_f32(vr0123, vd0123));
+    vr4567 = vmulq_f32(vr4567, vrecpsq_f32(vr4567, vd4567));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
+    float32x4_t vf4567 = vmulq_f32(ve4567, vr4567);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcagtq_f32(vx0123, vdenorm_cutoff)));
+    vf4567 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf4567), vcagtq_f32(vx4567, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm0123 = vcltq_f32(vx0123, vmovq_n_f32(0.0f));
+    const uint32x4_t vm4567 = vcltq_f32(vx4567, vmovq_n_f32(0.0f));
+
+    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
+    vf4567 = vbslq_f32(vm4567, vf4567, vsubq_f32(vone, vf4567));
+
+    vst1q_f32(y, vf0123); y += 4;
+    vst1q_f32(y, vf4567); y += 4;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is fused with the multiplication by -log2(e) into one FMA.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // loads a full 4-lane vector although fewer than 4 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is fused with the multiplication by -log2(e) into one FMA.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vfmaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
+    vt = vfmaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
+    vp = vfmaq_f32(vc3, vp, vt);
+    vp = vfmaq_f32(vc2, vp, vt);
+    vp = vfmaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vfmaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/psimd-p5-div-x12.c b/src/f32-sigmoid/gen/psimd-p5-div-x12.c
new file mode 100644
index 0000000..4a09311
--- /dev/null
+++ b/src/f32-sigmoid/gen/psimd-p5-div-x12.c

@@ -0,0 +1,266 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/psimd-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+  // Computes y[i] = sigmoid(x[i]) for n / sizeof(float) elements; n is a size in bytes and params is not read.
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
+    const psimd_f32 vx89AB = psimd_load_f32(x + 8);
+    x += 12;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz0123 = psimd_abs_f32(vx0123);
+    const psimd_f32 vz4567 = psimd_abs_f32(vx4567);
+    const psimd_f32 vz89AB = psimd_abs_f32(vx89AB);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in one
+    // psimd_qfma_f32 call. The trick is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vz0123, vminus_log2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vz4567, vminus_log2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vz89AB, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vz0123, vn0123, vln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vz4567, vn4567, vln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vz89AB, vn89AB, vln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vt0123, vc5);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vt4567, vc5);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vt89AB, vc5);
+
+    vp0123 = psimd_qfma_f32(vc3, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc3, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc3, vt89AB, vp89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc2, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc2, vt89AB, vp89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc1, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc1, vt89AB, vp89AB);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    const psimd_f32 ve0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    const psimd_f32 ve4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    const psimd_f32 ve89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf0123 = psimd_div_f32(ve0123, psimd_add_f32(ve0123, vone));
+    psimd_f32 vf4567 = psimd_div_f32(ve4567, psimd_add_f32(ve4567, vone));
+    psimd_f32 vf89AB = psimd_div_f32(ve89AB, psimd_add_f32(ve89AB, vone));
+
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vz0123 > vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vz4567 > vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vz89AB > vdenorm_cutoff, vf89AB);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf0123 = psimd_signblend_f32(vx0123, vf0123, psimd_sub_f32(vone, vf0123));
+    vf4567 = psimd_signblend_f32(vx4567, vf4567, psimd_sub_f32(vone, vf4567));
+    vf89AB = psimd_signblend_f32(vx89AB, vf89AB, psimd_sub_f32(vone, vf89AB));
+
+    psimd_store_f32(y, vf0123);
+    psimd_store_f32(y + 4, vf4567);
+    psimd_store_f32(y + 8, vf89AB);
+    y += 12;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in one
+    // psimd_qfma_f32 call. The trick is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx = psimd_load_f32(x);  // loads a full 4-lane vector although fewer than 4 elements remain
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in one
+    // psimd_qfma_f32 call. The trick is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vf);
+      vf = psimd_concat_hi_f32(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/psimd-p5-div-x16.c b/src/f32-sigmoid/gen/psimd-p5-div-x16.c
new file mode 100644
index 0000000..fe5a19d
--- /dev/null
+++ b/src/f32-sigmoid/gen/psimd-p5-div-x16.c

@@ -0,0 +1,283 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/psimd-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x16(  // y[i] := sigmoid(x[i]); the main loop handles 16 floats per iteration
+    size_t n,  // size of the input in bytes; asserted to be a multiple of sizeof(float)
+    const float* x,  // input elements
+    float* y,  // output elements
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);  // 1.5 * 2**23 + 127; the +127 becomes the exponent bias after << 23
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);  // -log2(e)
+  // Last 7 bits of vln2_hi are zeroes (presumably so that n * ln2_hi is computed exactly for |n| <= 126).
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);  // c1..c5: coefficients of the polynomial P5 used below
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {  // main loop: four 4-float vectors per iteration
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
+    const psimd_f32 vx89AB = psimd_load_f32(x + 8);
+    const psimd_f32 vxCDEF = psimd_load_f32(x + 12);
+    x += 16;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz0123 = psimd_abs_f32(vx0123);
+    const psimd_f32 vz4567 = psimd_abs_f32(vx4567);
+    const psimd_f32 vz89AB = psimd_abs_f32(vx89AB);
+    const psimd_f32 vzCDEF = psimd_abs_f32(vxCDEF);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in a single
+    // psimd_qfma_f32 call. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vz0123, vminus_log2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vz4567, vminus_log2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vz89AB, vminus_log2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vzCDEF, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The bits of the biased n are reused as the exponent.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use the Cody-Waite range reduction method (note the two constants representing log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vz0123, vn0123, vln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vz4567, vn4567, vln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vz89AB, vn89AB, vln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vzCDEF, vnCDEF, vln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vt0123, vc5);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vt4567, vc5);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vt89AB, vc5);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vtCDEF, vc5);
+
+    vp0123 = psimd_qfma_f32(vc3, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc3, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc3, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vtCDEF, vpCDEF);
+
+    vp0123 = psimd_qfma_f32(vc2, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc2, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc2, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vtCDEF, vpCDEF);
+
+    vp0123 = psimd_qfma_f32(vc1, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc1, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc1, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vtCDEF, vpCDEF);
+
+    // Reconstruct the exp(-z) value (p is the inner part of P5 computed above, without the leading "1 + t *"):
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+    const psimd_f32 ve0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    const psimd_f32 ve4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    const psimd_f32 ve89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    const psimd_f32 veCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf0123 = psimd_div_f32(ve0123, psimd_add_f32(ve0123, vone));
+    psimd_f32 vf4567 = psimd_div_f32(ve4567, psimd_add_f32(ve4567, vone));
+    psimd_f32 vf89AB = psimd_div_f32(ve89AB, psimd_add_f32(ve89AB, vone));
+    psimd_f32 vfCDEF = psimd_div_f32(veCDEF, psimd_add_f32(veCDEF, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vz0123 > vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vz4567 > vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vz89AB > vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vzCDEF > vdenorm_cutoff, vfCDEF);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf0123 = psimd_signblend_f32(vx0123, vf0123, psimd_sub_f32(vone, vf0123));
+    vf4567 = psimd_signblend_f32(vx4567, vf4567, psimd_sub_f32(vone, vf4567));
+    vf89AB = psimd_signblend_f32(vx89AB, vf89AB, psimd_sub_f32(vone, vf89AB));
+    vfCDEF = psimd_signblend_f32(vxCDEF, vfCDEF, psimd_sub_f32(vone, vfCDEF));
+
+    psimd_store_f32(y, vf0123);
+    psimd_store_f32(y + 4, vf4567);
+    psimd_store_f32(y + 8, vf89AB);
+    psimd_store_f32(y + 12, vfCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // remainder loop: one 4-float vector per iteration
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in a single
+    // psimd_qfma_f32 call. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The bits of the biased n are reused as the exponent.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use the Cody-Waite range reduction method (note the two constants representing log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value (p is the inner part of P5 computed above, without the leading "1 + t *"):
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1 to 3 elements remain (n is 4, 8 or 12 bytes given the assert above)
+    const psimd_f32 vx = psimd_load_f32(x);  // NOTE(review): loads 4 floats although fewer remain; presumably callers make the over-read safe - confirm
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in a single
+    // psimd_qfma_f32 call. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The bits of the biased n are reused as the exponent.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use the Cody-Waite range reduction method (note the two constants representing log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value (p is the inner part of P5 computed above, without the leading "1 + t *"):
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 elements remain
+      psimd_store2_f32(y, vf);  // presumably writes the two low lanes
+      vf = psimd_concat_hi_f32(vf, vf);  // presumably moves the high lanes down for the final single-element store
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd element remains
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/psimd-p5-div-x20.c b/src/f32-sigmoid/gen/psimd-p5-div-x20.c
new file mode 100644
index 0000000..81aa294
--- /dev/null
+++ b/src/f32-sigmoid/gen/psimd-p5-div-x20.c

@@ -0,0 +1,300 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/psimd-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x20(  // y[i] := sigmoid(x[i]); the main loop handles 20 floats per iteration
+    size_t n,  // size of the input in bytes; asserted to be a multiple of sizeof(float)
+    const float* x,  // input elements
+    float* y,  // output elements
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);  // 1.5 * 2**23 + 127; the +127 becomes the exponent bias after << 23
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);  // -log2(e)
+  // Last 7 bits of vln2_hi are zeroes (presumably so that n * ln2_hi is computed exactly for |n| <= 126).
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);  // c1..c5: coefficients of the polynomial P5 used below
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {  // main loop: five 4-float vectors per iteration
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
+    const psimd_f32 vx89AB = psimd_load_f32(x + 8);
+    const psimd_f32 vxCDEF = psimd_load_f32(x + 12);
+    const psimd_f32 vxGHIJ = psimd_load_f32(x + 16);
+    x += 20;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz0123 = psimd_abs_f32(vx0123);
+    const psimd_f32 vz4567 = psimd_abs_f32(vx4567);
+    const psimd_f32 vz89AB = psimd_abs_f32(vx89AB);
+    const psimd_f32 vzCDEF = psimd_abs_f32(vxCDEF);
+    const psimd_f32 vzGHIJ = psimd_abs_f32(vxGHIJ);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in a single
+    // psimd_qfma_f32 call. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vz0123, vminus_log2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vz4567, vminus_log2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vz89AB, vminus_log2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The bits of the biased n are reused as the exponent.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use the Cody-Waite range reduction method (note the two constants representing log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vz0123, vn0123, vln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vz4567, vn4567, vln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vz89AB, vn89AB, vln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vzCDEF, vnCDEF, vln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vzGHIJ, vnGHIJ, vln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vt0123, vc5);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vt4567, vc5);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vt89AB, vc5);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vtCDEF, vc5);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vtGHIJ, vc5);
+
+    vp0123 = psimd_qfma_f32(vc3, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc3, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc3, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vtCDEF, vpCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vtGHIJ, vpGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc2, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc2, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vtCDEF, vpCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vtGHIJ, vpGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc1, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc1, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vtCDEF, vpCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vtGHIJ, vpGHIJ);
+
+    // Reconstruct the exp(-z) value (p is the inner part of P5 computed above, without the leading "1 + t *"):
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    const psimd_f32 ve0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    const psimd_f32 ve4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    const psimd_f32 ve89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    const psimd_f32 veCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    const psimd_f32 veGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf0123 = psimd_div_f32(ve0123, psimd_add_f32(ve0123, vone));
+    psimd_f32 vf4567 = psimd_div_f32(ve4567, psimd_add_f32(ve4567, vone));
+    psimd_f32 vf89AB = psimd_div_f32(ve89AB, psimd_add_f32(ve89AB, vone));
+    psimd_f32 vfCDEF = psimd_div_f32(veCDEF, psimd_add_f32(veCDEF, vone));
+    psimd_f32 vfGHIJ = psimd_div_f32(veGHIJ, psimd_add_f32(veGHIJ, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vz0123 > vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vz4567 > vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vz89AB > vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vzCDEF > vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vzGHIJ > vdenorm_cutoff, vfGHIJ);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf0123 = psimd_signblend_f32(vx0123, vf0123, psimd_sub_f32(vone, vf0123));
+    vf4567 = psimd_signblend_f32(vx4567, vf4567, psimd_sub_f32(vone, vf4567));
+    vf89AB = psimd_signblend_f32(vx89AB, vf89AB, psimd_sub_f32(vone, vf89AB));
+    vfCDEF = psimd_signblend_f32(vxCDEF, vfCDEF, psimd_sub_f32(vone, vfCDEF));
+    vfGHIJ = psimd_signblend_f32(vxGHIJ, vfGHIJ, psimd_sub_f32(vone, vfGHIJ));
+
+    psimd_store_f32(y, vf0123);
+    psimd_store_f32(y + 4, vf4567);
+    psimd_store_f32(y + 8, vf89AB);
+    psimd_store_f32(y + 12, vfCDEF);
+    psimd_store_f32(y + 16, vfGHIJ);
+    y += 20;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // remainder loop: one 4-float vector per iteration
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in a single
+    // psimd_qfma_f32 call. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The bits of the biased n are reused as the exponent.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use the Cody-Waite range reduction method (note the two constants representing log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value (p is the inner part of P5 computed above, without the leading "1 + t *"):
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1 to 3 elements remain (n is 4, 8 or 12 bytes given the assert above)
+    const psimd_f32 vx = psimd_load_f32(x);  // NOTE(review): loads 4 floats although fewer remain; presumably callers make the over-read safe - confirm
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) in a single
+    // psimd_qfma_f32 call. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or
+    // saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The bits of the biased n are reused as the exponent.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use the Cody-Waite range reduction method (note the two constants representing log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value (p is the inner part of P5 computed above, without the leading "1 + t *"):
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 elements remain
+      psimd_store2_f32(y, vf);  // presumably writes the two low lanes
+      vf = psimd_concat_hi_f32(vf, vf);  // presumably moves the high lanes down for the final single-element store
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd element remains
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/psimd-p5-div-x24.c b/src/f32-sigmoid/gen/psimd-p5-div-x24.c
new file mode 100644
index 0000000..22b9fb2
--- /dev/null
+++ b/src/f32-sigmoid/gen/psimd-p5-div-x24.c

@@ -0,0 +1,317 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/psimd-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+// Writes y[i] = sigmoid(x[i]) for n bytes of floats: 24 per main-loop pass, then 4 at a time, then a 1-3 element tail.
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x24(
+    size_t n,  // number of bytes to process (a multiple of sizeof(float))
+    const float* x,
+    float* y,
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);  // 1.5 * 2**23 + 127 (127 = exponent bias used by vs)
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
+    const psimd_f32 vx89AB = psimd_load_f32(x + 8);
+    const psimd_f32 vxCDEF = psimd_load_f32(x + 12);
+    const psimd_f32 vxGHIJ = psimd_load_f32(x + 16);
+    const psimd_f32 vxKLMN = psimd_load_f32(x + 20);
+    x += 24;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz0123 = psimd_abs_f32(vx0123);
+    const psimd_f32 vz4567 = psimd_abs_f32(vx4567);
+    const psimd_f32 vz89AB = psimd_abs_f32(vx89AB);
+    const psimd_f32 vzCDEF = psimd_abs_f32(vxCDEF);
+    const psimd_f32 vzGHIJ = psimd_abs_f32(vxGHIJ);
+    const psimd_f32 vzKLMN = psimd_abs_f32(vxKLMN);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vz0123, vminus_log2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vz4567, vminus_log2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vz89AB, vminus_log2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vzCDEF, vminus_log2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vzGHIJ, vminus_log2e);
+    psimd_f32 vnKLMN = psimd_qfma_f32(vmagic_bias, vzKLMN, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+    const psimd_f32 vsKLMN = (psimd_f32) ((psimd_u32) vnKLMN << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+    vnKLMN = psimd_sub_f32(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vz0123, vn0123, vln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vz4567, vn4567, vln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vz89AB, vn89AB, vln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vzCDEF, vnCDEF, vln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vzGHIJ, vnGHIJ, vln2_hi);
+    psimd_f32 vtKLMN = psimd_qfma_f32(vzKLMN, vnKLMN, vln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vln2_lo);
+    vtKLMN = psimd_qfma_f32(vtKLMN, vnKLMN, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vt0123, vc5);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vt4567, vc5);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vt89AB, vc5);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vtCDEF, vc5);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vtGHIJ, vc5);
+    psimd_f32 vpKLMN = psimd_qfma_f32(vc4, vtKLMN, vc5);
+
+    vp0123 = psimd_qfma_f32(vc3, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc3, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc3, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vtCDEF, vpCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vtGHIJ, vpGHIJ);
+    vpKLMN = psimd_qfma_f32(vc3, vtKLMN, vpKLMN);
+
+    vp0123 = psimd_qfma_f32(vc2, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc2, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc2, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vtCDEF, vpCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vtGHIJ, vpGHIJ);
+    vpKLMN = psimd_qfma_f32(vc2, vtKLMN, vpKLMN);
+
+    vp0123 = psimd_qfma_f32(vc1, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc1, vt4567, vp4567);
+    vp89AB = psimd_qfma_f32(vc1, vt89AB, vp89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vtCDEF, vpCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vtGHIJ, vpGHIJ);
+    vpKLMN = psimd_qfma_f32(vc1, vtKLMN, vpKLMN);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+    vtKLMN = psimd_mul_f32(vtKLMN, vsKLMN);
+
+    const psimd_f32 ve0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    const psimd_f32 ve4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    const psimd_f32 ve89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    const psimd_f32 veCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    const psimd_f32 veGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+    const psimd_f32 veKLMN = psimd_qfma_f32(vsKLMN, vtKLMN, vpKLMN);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf0123 = psimd_div_f32(ve0123, psimd_add_f32(ve0123, vone));
+    psimd_f32 vf4567 = psimd_div_f32(ve4567, psimd_add_f32(ve4567, vone));
+    psimd_f32 vf89AB = psimd_div_f32(ve89AB, psimd_add_f32(ve89AB, vone));
+    psimd_f32 vfCDEF = psimd_div_f32(veCDEF, psimd_add_f32(veCDEF, vone));
+    psimd_f32 vfGHIJ = psimd_div_f32(veGHIJ, psimd_add_f32(veGHIJ, vone));
+    psimd_f32 vfKLMN = psimd_div_f32(veKLMN, psimd_add_f32(veKLMN, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vz0123 > vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vz4567 > vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vz89AB > vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vzCDEF > vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vzGHIJ > vdenorm_cutoff, vfGHIJ);
+    vfKLMN = psimd_andnotmask_f32(vzKLMN > vdenorm_cutoff, vfKLMN);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf0123 = psimd_signblend_f32(vx0123, vf0123, psimd_sub_f32(vone, vf0123));
+    vf4567 = psimd_signblend_f32(vx4567, vf4567, psimd_sub_f32(vone, vf4567));
+    vf89AB = psimd_signblend_f32(vx89AB, vf89AB, psimd_sub_f32(vone, vf89AB));
+    vfCDEF = psimd_signblend_f32(vxCDEF, vfCDEF, psimd_sub_f32(vone, vfCDEF));
+    vfGHIJ = psimd_signblend_f32(vxGHIJ, vfGHIJ, psimd_sub_f32(vone, vfGHIJ));
+    vfKLMN = psimd_signblend_f32(vxKLMN, vfKLMN, psimd_sub_f32(vone, vfKLMN));
+
+    psimd_store_f32(y, vf0123);
+    psimd_store_f32(y + 4, vf4567);
+    psimd_store_f32(y + 8, vf89AB);
+    psimd_store_f32(y + 12, vfCDEF);
+    psimd_store_f32(y + 16, vfGHIJ);
+    psimd_store_f32(y + 20, vfKLMN);
+    y += 24;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    // NOTE(review): only 1-3 floats remain here, yet a whole vector is loaded; presumably callers permit the over-read.
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+    // Store only the remaining results: a pair when the 2-float bit of n is set (y advances by 2), then a single one.
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vf);
+      vf = psimd_concat_hi_f32(vf, vf);  // presumably brings the upper lanes down for the single store - confirm
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/psimd-p5-div-x4.c b/src/f32-sigmoid/gen/psimd-p5-div-x4.c
new file mode 100644
index 0000000..148d703
--- /dev/null
+++ b/src/f32-sigmoid/gen/psimd-p5-div-x4.c

@@ -0,0 +1,167 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/psimd-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+// Writes y[i] = sigmoid(x[i]) for n bytes of floats: 4 elements per loop pass, then a 1-3 element tail.
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x4(
+    size_t n,  // number of bytes to process (a multiple of sizeof(float))
+    const float* x,
+    float* y,
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);  // 1.5 * 2**23 + 127 (127 = exponent bias used by vs)
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    // NOTE(review): only 1-3 floats remain here, yet a whole vector is loaded; presumably callers permit the over-read.
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+    // Store only the remaining results: a pair when the 2-float bit of n is set (y advances by 2), then a single one.
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vf);
+      vf = psimd_concat_hi_f32(vf, vf);  // presumably brings the upper lanes down for the single store - confirm
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/psimd-p5-div-x8.c b/src/f32-sigmoid/gen/psimd-p5-div-x8.c
new file mode 100644
index 0000000..c20c49b
--- /dev/null
+++ b/src/f32-sigmoid/gen/psimd-p5-div-x8.c

@@ -0,0 +1,249 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/psimd-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+// Writes y[i] = sigmoid(x[i]) for n bytes of floats: 8 per main-loop pass, then 4 at a time, then a 1-3 element tail.
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x8(
+    size_t n,  // number of bytes to process (a multiple of sizeof(float))
+    const float* x,
+    float* y,
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);  // 1.5 * 2**23 + 127 (127 = exponent bias used by vs)
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const psimd_f32 vx0123 = psimd_load_f32(x);
+    const psimd_f32 vx4567 = psimd_load_f32(x + 4);
+    x += 8;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz0123 = psimd_abs_f32(vx0123);
+    const psimd_f32 vz4567 = psimd_abs_f32(vx4567);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vz0123, vminus_log2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vz4567, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vz0123, vn0123, vln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vz4567, vn4567, vln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vt0123, vc5);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vt4567, vc5);
+
+    vp0123 = psimd_qfma_f32(vc3, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc3, vt4567, vp4567);
+
+    vp0123 = psimd_qfma_f32(vc2, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc2, vt4567, vp4567);
+
+    vp0123 = psimd_qfma_f32(vc1, vt0123, vp0123);
+    vp4567 = psimd_qfma_f32(vc1, vt4567, vp4567);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+
+    const psimd_f32 ve0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    const psimd_f32 ve4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf0123 = psimd_div_f32(ve0123, psimd_add_f32(ve0123, vone));
+    psimd_f32 vf4567 = psimd_div_f32(ve4567, psimd_add_f32(ve4567, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vz0123 > vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vz4567 > vdenorm_cutoff, vf4567);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf0123 = psimd_signblend_f32(vx0123, vf0123, psimd_sub_f32(vone, vf0123));
+    vf4567 = psimd_signblend_f32(vx4567, vf4567, psimd_sub_f32(vone, vf4567));
+
+    psimd_store_f32(y, vf0123);
+    psimd_store_f32(y + 4, vf4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    // NOTE(review): only 1-3 floats remain here, yet a whole vector is loaded; presumably callers permit the over-read.
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace the result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting it back. The addition and the multiplication by -log2e share a single psimd_qfma_f32 call.
+    // The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs, the comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+    // Store only the remaining results: a pair when the 2-float bit of n is set (y advances by 2), then a single one.
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vf);
+      vf = psimd_concat_hi_f32(vf, vf);  // presumably brings the upper lanes down for the single store - confirm
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x1.c b/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x1.c
index c398b0c..c96c573 100644
--- a/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x1.c
+++ b/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x1.c

@@ -28,11 +28,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x2048 = -0x1.715476p11f;
   // Last 18 bits are zeroes
   const float vln2_o2048_hi = 0x1.600000p-12f;
@@ -102,23 +100,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = vy / (vy + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *y++ = vf;
 
     n -= sizeof(float);

diff --git a/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x2.c b/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x2.c
index abe7b0d..f69dc21 100644
--- a/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x2.c
+++ b/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x2.c

@@ -28,11 +28,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x2048 = -0x1.715476p11f;
   // Last 18 bits are zeroes
   const float vln2_o2048_hi = 0x1.600000p-12f;
@@ -116,6 +114,15 @@
     float vf0 = vy0 / (vy0 + vone);
     float vf1 = vy1 / (vy1 + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
+      vf0 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
+      vf1 = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx0 > 0.0f) {
       vf0 = vone - vf0;
@@ -124,24 +131,6 @@
       vf1 = vone - vf1;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 > vone_cutoff) {
-      vf0 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx1 > vone_cutoff) {
-      vf1 = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
-      vf0 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
-      vf1 = 0.0f;
-    }
-
     y[0] = vf0;
     y[1] = vf1;
     y += 2;
@@ -205,23 +194,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = vy / (vy + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *y = vf;
   }
 }

diff --git a/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x4.c b/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x4.c
index 2bfd5ca..51a6bcd 100644
--- a/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x4.c
+++ b/src/f32-sigmoid/gen/scalar-lut2048-p1-div-x4.c

@@ -28,11 +28,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x2048 = -0x1.715476p11f;
   // Last 18 bits are zeroes
   const float vln2_o2048_hi = 0x1.600000p-12f;
@@ -140,6 +138,21 @@
     float vf2 = vy2 / (vy2 + vone);
     float vf3 = vy3 / (vy3 + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
+      vf0 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
+      vf1 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz2 > vdenorm_cutoff) {
+      vf2 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz3 > vdenorm_cutoff) {
+      vf3 = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx0 > 0.0f) {
       vf0 = vone - vf0;
@@ -154,36 +167,6 @@
       vf3 = vone - vf3;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 > vone_cutoff) {
-      vf0 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx1 > vone_cutoff) {
-      vf1 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx2 > vone_cutoff) {
-      vf2 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx3 > vone_cutoff) {
-      vf3 = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
-      vf0 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
-      vf1 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx2 < vdenorm_cutoff) {
-      vf2 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx3 < vdenorm_cutoff) {
-      vf3 = 0.0f;
-    }
-
     y[0] = vf0;
     y[1] = vf1;
     y[2] = vf2;
@@ -250,23 +233,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = vy / (vy + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y++ = vf;
 
       n -= sizeof(float);

diff --git a/src/f32-sigmoid/gen/scalar-lut64-p2-div-x1.c b/src/f32-sigmoid/gen/scalar-lut64-p2-div-x1.c
index 8cb7934..8efdcf0 100644
--- a/src/f32-sigmoid/gen/scalar-lut64-p2-div-x1.c
+++ b/src/f32-sigmoid/gen/scalar-lut64-p2-div-x1.c

@@ -28,11 +28,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x64 = -0x1.715476p6f;
   // Last 13 bits are zeroes
   const float vln2_o64_hi =  0x1.630000p-7f;
@@ -104,23 +102,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = vy / (vy + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *y++ = vf;
 
     n -= sizeof(float);

diff --git a/src/f32-sigmoid/gen/scalar-lut64-p2-div-x2.c b/src/f32-sigmoid/gen/scalar-lut64-p2-div-x2.c
index 5c3b748..ebe9491 100644
--- a/src/f32-sigmoid/gen/scalar-lut64-p2-div-x2.c
+++ b/src/f32-sigmoid/gen/scalar-lut64-p2-div-x2.c

@@ -28,11 +28,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x64 = -0x1.715476p6f;
   // Last 13 bits are zeroes
   const float vln2_o64_hi =  0x1.630000p-7f;
@@ -120,6 +118,15 @@
     float vf0 = vy0 / (vy0 + vone);
     float vf1 = vy1 / (vy1 + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
+      vf0 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
+      vf1 = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx0 > 0.0f) {
       vf0 = vone - vf0;
@@ -128,24 +135,6 @@
       vf1 = vone - vf1;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 > vone_cutoff) {
-      vf0 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx1 > vone_cutoff) {
-      vf1 = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
-      vf0 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
-      vf1 = 0.0f;
-    }
-
     y[0] = vf0;
     y[1] = vf1;
     y += 2;
@@ -211,23 +200,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = vy / (vy + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *y = vf;
   }
 }

diff --git a/src/f32-sigmoid/gen/scalar-lut64-p2-div-x4.c b/src/f32-sigmoid/gen/scalar-lut64-p2-div-x4.c
index 32c832c..ffadc34 100644
--- a/src/f32-sigmoid/gen/scalar-lut64-p2-div-x4.c
+++ b/src/f32-sigmoid/gen/scalar-lut64-p2-div-x4.c

@@ -28,11 +28,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x64 = -0x1.715476p6f;
   // Last 13 bits are zeroes
   const float vln2_o64_hi =  0x1.630000p-7f;
@@ -146,6 +144,21 @@
     float vf2 = vy2 / (vy2 + vone);
     float vf3 = vy3 / (vy3 + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
+      vf0 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
+      vf1 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz2 > vdenorm_cutoff) {
+      vf2 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz3 > vdenorm_cutoff) {
+      vf3 = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx0 > 0.0f) {
       vf0 = vone - vf0;
@@ -160,36 +173,6 @@
       vf3 = vone - vf3;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 > vone_cutoff) {
-      vf0 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx1 > vone_cutoff) {
-      vf1 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx2 > vone_cutoff) {
-      vf2 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx3 > vone_cutoff) {
-      vf3 = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
-      vf0 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
-      vf1 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx2 < vdenorm_cutoff) {
-      vf2 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx3 < vdenorm_cutoff) {
-      vf3 = 0.0f;
-    }
-
     y[0] = vf0;
     y[1] = vf1;
     y[2] = vf2;
@@ -258,23 +241,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = vy / (vy + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y++ = vf;
 
       n -= sizeof(float);

diff --git a/src/f32-sigmoid/gen/scalar-p5-div-x1.c b/src/f32-sigmoid/gen/scalar-p5-div-x1.c
index f39e711..7ee9b94 100644
--- a/src/f32-sigmoid/gen/scalar-p5-div-x1.c
+++ b/src/f32-sigmoid/gen/scalar-p5-div-x1.c

@@ -25,11 +25,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.8000FEp23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e = -0x1.715476p+0f;
   // Last 7 bits are zeroes
   const float vln2_hi = 0x1.62E400p-1f;
@@ -91,23 +89,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = ve / (ve + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *y++ = vf;
 
     n -= sizeof(float);

diff --git a/src/f32-sigmoid/gen/scalar-p5-div-x2.c b/src/f32-sigmoid/gen/scalar-p5-div-x2.c
index 288d4de..0aa9fd6 100644
--- a/src/f32-sigmoid/gen/scalar-p5-div-x2.c
+++ b/src/f32-sigmoid/gen/scalar-p5-div-x2.c

@@ -25,11 +25,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.8000FEp23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e = -0x1.715476p+0f;
   // Last 7 bits are zeroes
   const float vln2_hi = 0x1.62E400p-1f;
@@ -111,6 +109,15 @@
     float vf0 = ve0 / (ve0 + vone);
     float vf1 = ve1 / (ve1 + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
+      vf0 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
+      vf1 = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx0 > 0.0f) {
       vf0 = vone - vf0;
@@ -119,24 +126,6 @@
       vf1 = vone - vf1;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 > vone_cutoff) {
-      vf0 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx1 > vone_cutoff) {
-      vf1 = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
-      vf0 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
-      vf1 = 0.0f;
-    }
-
     y[0] = vf0;
     y[1] = vf1;
     y += 2;
@@ -190,23 +179,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = ve / (ve + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *y = vf;
   }
 }

diff --git a/src/f32-sigmoid/gen/scalar-p5-div-x4.c b/src/f32-sigmoid/gen/scalar-p5-div-x4.c
index 0b148bc..a51ac1f 100644
--- a/src/f32-sigmoid/gen/scalar-p5-div-x4.c
+++ b/src/f32-sigmoid/gen/scalar-p5-div-x4.c

@@ -25,11 +25,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.8000FEp23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e = -0x1.715476p+0f;
   // Last 7 bits are zeroes
   const float vln2_hi = 0x1.62E400p-1f;
@@ -139,6 +137,21 @@
     float vf2 = ve2 / (ve2 + vone);
     float vf3 = ve3 / (ve3 + vone);
 
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz0 > vdenorm_cutoff) {
+      vf0 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz1 > vdenorm_cutoff) {
+      vf1 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz2 > vdenorm_cutoff) {
+      vf2 = 0.0f;
+    }
+    if XNN_UNPREDICTABLE(vz3 > vdenorm_cutoff) {
+      vf3 = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx0 > 0.0f) {
       vf0 = vone - vf0;
@@ -153,36 +166,6 @@
       vf3 = vone - vf3;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 > vone_cutoff) {
-      vf0 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx1 > vone_cutoff) {
-      vf1 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx2 > vone_cutoff) {
-      vf2 = vone;
-    }
-    if XNN_UNPREDICTABLE(vx3 > vone_cutoff) {
-      vf3 = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx0 < vdenorm_cutoff) {
-      vf0 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx1 < vdenorm_cutoff) {
-      vf1 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx2 < vdenorm_cutoff) {
-      vf2 = 0.0f;
-    }
-    if XNN_UNPREDICTABLE(vx3 < vdenorm_cutoff) {
-      vf3 = 0.0f;
-    }
-
     y[0] = vf0;
     y[1] = vf1;
     y[2] = vf2;
@@ -239,23 +222,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = ve / (ve + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y++ = vf;
 
       n -= sizeof(float);

diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x12.c b/src/f32-sigmoid/gen/sse2-p5-div-x12.c
new file mode 100644
index 0000000..da8f0a2
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x12.c

@@ -0,0 +1,283 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse2_p5_div_x12(  // y[i] := sigmoid(x[i]) for n / sizeof(float) elements
+    size_t n,  // size of the input in bytes; asserted to be a multiple of sizeof(float)
+    const float* x,  // input elements (read with unaligned loads)
+    float* y,  // output elements (written with unaligned stores)
+    const void* params)  // not referenced by this kernel
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);  // only the sign bit is set
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    const __m128 vx89AB = _mm_loadu_ps(x + 8);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
+    const __m128 vz89AB = _mm_or_ps(vx89AB, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the
+    // result to an integer, then subtracting the large number back. The trick with adding a large number is valid only
+    // within certain bounds (|x| <= 2**22), but that's ok, because for z below the denormal cutoff (about -87.336544)
+    // sigmoidf(z) underflows anyway. We fix up the result for such inputs after the division, before sigmoid(x) is
+    // reconstructed.
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 ve4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 ve89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd4567 = _mm_add_ps(ve4567, vone);
+    __m128 vd89AB = _mm_add_ps(ve89AB, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+    __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
+    __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
+
+    // For z below the denormal cutoff, replace sigmoid(z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z). The mask is set where the sign bit of x is set.
+    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
+    __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
+    __m128 vm89AB = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx89AB)));
+
+    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
+    vf4567 = _mm_or_ps(_mm_and_ps(vf4567, vm4567), _mm_andnot_ps(vm4567, _mm_sub_ps(vone, vf4567)));
+    vf89AB = _mm_or_ps(_mm_and_ps(vf89AB, vm89AB), _mm_andnot_ps(vm89AB, _mm_sub_ps(vone, vf89AB)));
+
+    _mm_storeu_ps(y, vf0123);
+    _mm_storeu_ps(y + 4, vf4567);
+    _mm_storeu_ps(y + 8, vf89AB);
+
+    x += 12;
+    y += 12;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the
+    // result to an integer, then subtracting the large number back. The trick with adding a large number is valid only
+    // within certain bounds (|x| <= 2**22), but that's ok, because for z below the denormal cutoff (about -87.336544)
+    // sigmoidf(z) underflows anyway. We fix up the result for such inputs after the division, before sigmoid(x) is
+    // reconstructed.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For z below the denormal cutoff, replace sigmoid(z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z). The mask is set where the sign bit of x is set.
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);  // NOTE(review): loads 4 floats although only 1-3 remain; confirm padding
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the
+    // result to an integer, then subtracting the large number back. The trick with adding a large number is valid only
+    // within certain bounds (|x| <= 2**22), but that's ok, because for z below the denormal cutoff (about -87.336544)
+    // sigmoidf(z) underflows anyway. We fix up the result for such inputs after the division, before sigmoid(x) is
+    // reconstructed.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For z below the denormal cutoff, replace sigmoid(z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z). The mask is set where the sign bit of x is set.
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    if (n & (2 * sizeof(float))) {  // two or three elements remain: write the two low lanes
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);  // bring the two high lanes down for the single-element store below
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd element remains: write the lowest lane
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x16.c b/src/f32-sigmoid/gen/sse2-p5-div-x16.c
index 5e232cf..65e43cd 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x16.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x16.c

@@ -27,10 +27,8 @@
   // The smallest x for which sigmoidf(x) is normalized.
   // This number is also the smallest x for which expf(x) is normalized.
   const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
   const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
-  // Last 8 bits are zeroes
+  // Last 7 bits are zeroes
   const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
   const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
   const __m128 vone = _mm_set1_ps(1.0f);
@@ -143,6 +141,13 @@
     __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
     __m128 vfCDEF = _mm_div_ps(veCDEF, vdCDEF);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vzCDEF, vdenorm_cutoff), vfCDEF);
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
     __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
     __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
@@ -154,25 +159,6 @@
     vf89AB = _mm_or_ps(_mm_and_ps(vf89AB, vm89AB), _mm_andnot_ps(vm89AB, _mm_sub_ps(vone, vf89AB)));
     vfCDEF = _mm_or_ps(_mm_and_ps(vfCDEF, vmCDEF), _mm_andnot_ps(vmCDEF, _mm_sub_ps(vone, vfCDEF)));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-    vm4567 = _mm_cmpgt_ps(vx4567, vone_cutoff);
-    vm89AB = _mm_cmpgt_ps(vx89AB, vone_cutoff);
-    vmCDEF = _mm_cmpgt_ps(vxCDEF, vone_cutoff);
-
-    vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
-    vf4567 = _mm_or_ps(_mm_and_ps(vone, vm4567), _mm_andnot_ps(vm4567, vf4567));
-    vf89AB = _mm_or_ps(_mm_and_ps(vone, vm89AB), _mm_andnot_ps(vm89AB, vf89AB));
-    vfCDEF = _mm_or_ps(_mm_and_ps(vone, vmCDEF), _mm_andnot_ps(vmCDEF, vfCDEF));
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
-    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
-    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
-    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
-
     _mm_storeu_ps(y, vf0123);
     _mm_storeu_ps(y + 4, vf4567);
     _mm_storeu_ps(y + 8, vf89AB);
@@ -182,7 +168,7 @@
     y += 16;
   }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -191,7 +177,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -199,59 +185,54 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
-    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-    vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
 
-    _mm_storeu_ps(y, vf0123);
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
 
     x += 4;
     y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -260,7 +241,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -268,59 +249,54 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
-    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-    vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
 
     if (n & (2 * sizeof(float))) {
-      _mm_storel_pi((__m64*) y, vf0123);
-      vf0123 = _mm_movehl_ps(vf0123, vf0123);
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
       y += 2;
     }
     if (n & (1 * sizeof(float))) {
-      _mm_store_ss(y, vf0123);
+      _mm_store_ss(y, vf);
     }
   }
 }

diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x20.c b/src/f32-sigmoid/gen/sse2-p5-div-x20.c
new file mode 100644
index 0000000..916fff0
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x20.c

@@ -0,0 +1,321 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse2_p5_div_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    const __m128 vx89AB = _mm_loadu_ps(x + 8);
+    const __m128 vxCDEF = _mm_loadu_ps(x + 12);
+    const __m128 vxGHIJ = _mm_loadu_ps(x + 16);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
+    const __m128 vz89AB = _mm_or_ps(vx89AB, vsign_mask);
+    const __m128 vzCDEF = _mm_or_ps(vxCDEF, vsign_mask);
+    const __m128 vzGHIJ = _mm_or_ps(vxGHIJ, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vzCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vzGHIJ, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vzCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vzGHIJ);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 ve4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 ve89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 veCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 veGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd4567 = _mm_add_ps(ve4567, vone);
+    __m128 vd89AB = _mm_add_ps(ve89AB, vone);
+    __m128 vdCDEF = _mm_add_ps(veCDEF, vone);
+    __m128 vdGHIJ = _mm_add_ps(veGHIJ, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+    __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
+    __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
+    __m128 vfCDEF = _mm_div_ps(veCDEF, vdCDEF);
+    __m128 vfGHIJ = _mm_div_ps(veGHIJ, vdGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vzCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vzGHIJ, vdenorm_cutoff), vfGHIJ);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
+    __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
+    __m128 vm89AB = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx89AB)));
+    __m128 vmCDEF = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxCDEF)));
+    __m128 vmGHIJ = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxGHIJ)));
+
+    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
+    vf4567 = _mm_or_ps(_mm_and_ps(vf4567, vm4567), _mm_andnot_ps(vm4567, _mm_sub_ps(vone, vf4567)));
+    vf89AB = _mm_or_ps(_mm_and_ps(vf89AB, vm89AB), _mm_andnot_ps(vm89AB, _mm_sub_ps(vone, vf89AB)));
+    vfCDEF = _mm_or_ps(_mm_and_ps(vfCDEF, vmCDEF), _mm_andnot_ps(vmCDEF, _mm_sub_ps(vone, vfCDEF)));
+    vfGHIJ = _mm_or_ps(_mm_and_ps(vfGHIJ, vmGHIJ), _mm_andnot_ps(vmGHIJ, _mm_sub_ps(vone, vfGHIJ)));
+
+    _mm_storeu_ps(y, vf0123);
+    _mm_storeu_ps(y + 4, vf4567);
+    _mm_storeu_ps(y + 8, vf89AB);
+    _mm_storeu_ps(y + 12, vfCDEF);
+    _mm_storeu_ps(y + 16, vfGHIJ);
+
+    x += 20;
+    y += 20;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);  // NOTE(review): loads 4 floats although fewer than 4 remain; confirm callers permit the over-read.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x24.c b/src/f32-sigmoid/gen/sse2-p5-div-x24.c
new file mode 100644
index 0000000..920204f
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x24.c

@@ -0,0 +1,340 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse2_p5_div_x24(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    const __m128 vx89AB = _mm_loadu_ps(x + 8);
+    const __m128 vxCDEF = _mm_loadu_ps(x + 12);
+    const __m128 vxGHIJ = _mm_loadu_ps(x + 16);
+    const __m128 vxKLMN = _mm_loadu_ps(x + 20);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
+    const __m128 vz89AB = _mm_or_ps(vx89AB, vsign_mask);
+    const __m128 vzCDEF = _mm_or_ps(vxCDEF, vsign_mask);
+    const __m128 vzGHIJ = _mm_or_ps(vxGHIJ, vsign_mask);
+    const __m128 vzKLMN = _mm_or_ps(vxKLMN, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vzCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vzGHIJ, vlog2e), vmagic_bias);
+    __m128 vnKLMN = _mm_add_ps(_mm_mul_ps(vzKLMN, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+    const __m128 vsKLMN = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+    vnKLMN = _mm_sub_ps(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vzCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vzGHIJ);
+    __m128 vtKLMN = _mm_add_ps(_mm_mul_ps(vnKLMN, vminus_ln2_hi), vzKLMN);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+    vtKLMN = _mm_add_ps(_mm_mul_ps(vnKLMN, vminus_ln2_lo), vtKLMN);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+    __m128 vpKLMN = _mm_add_ps(_mm_mul_ps(vc5, vtKLMN), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+    vpKLMN = _mm_add_ps(_mm_mul_ps(vpKLMN, vtKLMN), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+    vpKLMN = _mm_add_ps(_mm_mul_ps(vpKLMN, vtKLMN), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+    vpKLMN = _mm_add_ps(_mm_mul_ps(vpKLMN, vtKLMN), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+    vtKLMN = _mm_mul_ps(vtKLMN, vsKLMN);
+
+    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 ve4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 ve89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 veCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 veGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+    __m128 veKLMN = _mm_add_ps(_mm_mul_ps(vtKLMN, vpKLMN), vsKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd4567 = _mm_add_ps(ve4567, vone);
+    __m128 vd89AB = _mm_add_ps(ve89AB, vone);
+    __m128 vdCDEF = _mm_add_ps(veCDEF, vone);
+    __m128 vdGHIJ = _mm_add_ps(veGHIJ, vone);
+    __m128 vdKLMN = _mm_add_ps(veKLMN, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+    __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
+    __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
+    __m128 vfCDEF = _mm_div_ps(veCDEF, vdCDEF);
+    __m128 vfGHIJ = _mm_div_ps(veGHIJ, vdGHIJ);
+    __m128 vfKLMN = _mm_div_ps(veKLMN, vdKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vzCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vzGHIJ, vdenorm_cutoff), vfGHIJ);
+    vfKLMN = _mm_andnot_ps(_mm_cmplt_ps(vzKLMN, vdenorm_cutoff), vfKLMN);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
+    __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
+    __m128 vm89AB = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx89AB)));
+    __m128 vmCDEF = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxCDEF)));
+    __m128 vmGHIJ = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxGHIJ)));
+    __m128 vmKLMN = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vxKLMN)));
+
+    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
+    vf4567 = _mm_or_ps(_mm_and_ps(vf4567, vm4567), _mm_andnot_ps(vm4567, _mm_sub_ps(vone, vf4567)));
+    vf89AB = _mm_or_ps(_mm_and_ps(vf89AB, vm89AB), _mm_andnot_ps(vm89AB, _mm_sub_ps(vone, vf89AB)));
+    vfCDEF = _mm_or_ps(_mm_and_ps(vfCDEF, vmCDEF), _mm_andnot_ps(vmCDEF, _mm_sub_ps(vone, vfCDEF)));
+    vfGHIJ = _mm_or_ps(_mm_and_ps(vfGHIJ, vmGHIJ), _mm_andnot_ps(vmGHIJ, _mm_sub_ps(vone, vfGHIJ)));
+    vfKLMN = _mm_or_ps(_mm_and_ps(vfKLMN, vmKLMN), _mm_andnot_ps(vmKLMN, _mm_sub_ps(vone, vfKLMN)));
+
+    _mm_storeu_ps(y, vf0123);
+    _mm_storeu_ps(y + 4, vf4567);
+    _mm_storeu_ps(y + 8, vf89AB);
+    _mm_storeu_ps(y + 12, vfCDEF);
+    _mm_storeu_ps(y + 16, vfGHIJ);
+    _mm_storeu_ps(y + 20, vfKLMN);
+
+    x += 24;
+    y += 24;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);  // NOTE(review): loads 4 floats although fewer than 4 remain; confirm callers permit the over-read.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x4.c b/src/f32-sigmoid/gen/sse2-p5-div-x4.c
new file mode 100644
index 0000000..f543ef8
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x4.c

@@ -0,0 +1,175 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+// SSE2 sigmoid micro-kernel: y[i] := sigmoid(x[i]) for n BYTES of floats (4 per main-loop pass); params is not read.
+void xnn_f32_sigmoid_ukernel__sse2_p5_div_x4(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized. It is compared against z = -abs(x) below.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits of the high part of -log(2) are zeroes; it is paired with the low part for Cody-Waite reduction.
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x) (obtained by forcing the sign bit on),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding a large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs with z below the denormal cutoff (-87.336544)
+    // underflow sigmoidf(z) anyway. We fix up the result for such inputs after the division, before the final
+    // reconstruction of sigmoid(x).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Build the scale s == 2**n for inputs which don't cause underflow (-87.33642 <= z <= 0.0, so -126 <= n <= 0):
+    // the magic bias also carries +127 in its low bits, so shifting left by 23 puts n + 127 into the exponent field.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner scheme, p excludes the 1).
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); vm is all-ones where the sign bit of x is set.
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // Remainder: 1 to 3 floats are left, but a full 4-float vector is loaded above and processed exactly as in the
+    // main loop; only the valid lanes are stored at the end.
+    // NOTE(review): the load reads up to 12 bytes past the last valid input element; presumably callers guarantee
+    // that this over-read is safe - confirm against the project's buffer padding convention.
+    //
+    // As in the main loop, we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding a large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs with z below the denormal cutoff (-87.336544)
+    // underflow sigmoidf(z) anyway. We fix up the result for such inputs after the division, before the final
+    // reconstruction of sigmoid(x).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Build the scale s == 2**n for inputs which don't cause underflow (-87.33642 <= z <= 0.0, so -126 <= n <= 0):
+    // the magic bias also carries +127 in its low bits, so shifting left by 23 puts n + 127 into the exponent field.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner scheme, p excludes the 1).
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); vm is all-ones where the sign bit of x is set.
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse2-p5-div-x8.c b/src/f32-sigmoid/gen/sse2-p5-div-x8.c
index bf9bf94..21f6fde 100644
--- a/src/f32-sigmoid/gen/sse2-p5-div-x8.c
+++ b/src/f32-sigmoid/gen/sse2-p5-div-x8.c

@@ -27,10 +27,8 @@
   // The smallest x for which sigmoidf(x) is normalized.
   // This number is also the smallest x for which expf(x) is normalized.
   const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
   const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
-  // Last 8 bits are zeroes
+  // Last 7 bits are zeroes
   const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
   const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
   const __m128 vone = _mm_set1_ps(1.0f);
@@ -113,6 +111,11 @@
     __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
     __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
     __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
     __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
@@ -120,19 +123,6 @@
     vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
     vf4567 = _mm_or_ps(_mm_and_ps(vf4567, vm4567), _mm_andnot_ps(vm4567, _mm_sub_ps(vone, vf4567)));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-    vm4567 = _mm_cmpgt_ps(vx4567, vone_cutoff);
-
-    vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
-    vf4567 = _mm_or_ps(_mm_and_ps(vone, vm4567), _mm_andnot_ps(vm4567, vf4567));
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
-    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
-
     _mm_storeu_ps(y, vf0123);
     _mm_storeu_ps(y + 4, vf4567);
 
@@ -140,7 +130,7 @@
     y += 8;
   }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -149,7 +139,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -157,59 +147,54 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
-    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-    vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
 
-    _mm_storeu_ps(y, vf0123);
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
 
     x += 4;
     y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -218,7 +203,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -226,59 +211,54 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
-    vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-    vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+    vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
 
     if (n & (2 * sizeof(float))) {
-      _mm_storel_pi((__m64*) y, vf0123);
-      vf0123 = _mm_movehl_ps(vf0123, vf0123);
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
       y += 2;
     }
     if (n & (1 * sizeof(float))) {
-      _mm_store_ss(y, vf0123);
+      _mm_store_ss(y, vf);
     }
   }
 }

diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x12.c b/src/f32-sigmoid/gen/sse41-p5-div-x12.c
new file mode 100644
index 0000000..7753dd3
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x12.c

@@ -0,0 +1,277 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+// SSE4.1 sigmoid micro-kernel: y[i] := sigmoid(x[i]) for n BYTES of floats (12, then 4 per pass); params is not read.
+void xnn_f32_sigmoid_ukernel__sse41_p5_div_x12(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized. It is compared against z = -abs(x) below.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits of the high part of -log(2) are zeroes; it is paired with the low part for Cody-Waite reduction.
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    const __m128 vx89AB = _mm_loadu_ps(x + 8);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x) (obtained by forcing the sign bit on),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
+    const __m128 vz89AB = _mm_or_ps(vx89AB, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding a large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs with z below the denormal cutoff (-87.336544)
+    // underflow sigmoidf(z) anyway. We fix up the result for such inputs after the division, before the final
+    // reconstruction of sigmoid(x).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
+
+    // Build the scale s == 2**n for inputs which don't cause underflow (-87.33642 <= z <= 0.0, so -126 <= n <= 0):
+    // the magic bias also carries +127 in its low bits, so shifting left by 23 puts n + 127 into the exponent field.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner scheme, p excludes the 1).
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 ve4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 ve89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd4567 = _mm_add_ps(ve4567, vone);
+    __m128 vd89AB = _mm_add_ps(ve89AB, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+    __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
+    __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); blendv selects per lane by the sign bit of x.
+    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
+    vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);
+    vf89AB = _mm_blendv_ps(_mm_sub_ps(vone, vf89AB), vf89AB, vx89AB);
+
+    _mm_storeu_ps(y, vf0123);
+    _mm_storeu_ps(y + 4, vf4567);
+    _mm_storeu_ps(y + 8, vf89AB);
+
+    x += 12;
+    y += 12;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x) (obtained by forcing the sign bit on),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding a large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs with z below the denormal cutoff (-87.336544)
+    // underflow sigmoidf(z) anyway. We fix up the result for such inputs after the division, before the final
+    // reconstruction of sigmoid(x).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Build the scale s == 2**n for inputs which don't cause underflow (-87.33642 <= z <= 0.0, so -126 <= n <= 0):
+    // the magic bias also carries +127 in its low bits, so shifting left by 23 puts n + 127 into the exponent field.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner scheme, p excludes the 1).
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); blendv selects per lane by the sign bit of x.
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // Remainder: 1 to 3 floats are left, but a full 4-float vector is loaded above and processed exactly as in the
+    // 4-float loop; only the valid lanes are stored at the end.
+    // NOTE(review): the load reads up to 12 bytes past the last valid input element; presumably callers guarantee
+    // that this over-read is safe - confirm against the project's buffer padding convention.
+    //
+    // As in the loops above, we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding a large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs with z below the denormal cutoff (-87.336544)
+    // underflow sigmoidf(z) anyway. We fix up the result for such inputs after the division, before the final
+    // reconstruction of sigmoid(x).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Build the scale s == 2**n for inputs which don't cause underflow (-87.33642 <= z <= 0.0, so -126 <= n <= 0):
+    // the magic bias also carries +127 in its low bits, so shifting left by 23 puts n + 127 into the exponent field.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2] (Horner scheme, p excludes the 1).
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); blendv selects per lane by the sign bit of x.
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x16.c b/src/f32-sigmoid/gen/sse41-p5-div-x16.c
index cf95fec..92f989a 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x16.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x16.c

@@ -27,10 +27,8 @@
   // The smallest x for which sigmoidf(x) is normalized.
   // This number is also the smallest x for which expf(x) is normalized.
   const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
   const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
-  // Last 8 bits are zeroes
+  // Last 7 bits are zeroes
   const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
   const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
   const __m128 vone = _mm_set1_ps(1.0f);
@@ -143,26 +141,19 @@
     __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
     __m128 vfCDEF = _mm_div_ps(veCDEF, vdCDEF);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vzCDEF, vdenorm_cutoff), vfCDEF);
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
     vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
     vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);
     vf89AB = _mm_blendv_ps(_mm_sub_ps(vone, vf89AB), vf89AB, vx89AB);
     vfCDEF = _mm_blendv_ps(_mm_sub_ps(vone, vfCDEF), vfCDEF, vxCDEF);
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
-    vf4567 = _mm_blendv_ps(vf4567, vone, _mm_cmpgt_ps(vx4567, vone_cutoff));
-    vf89AB = _mm_blendv_ps(vf89AB, vone, _mm_cmpgt_ps(vx89AB, vone_cutoff));
-    vfCDEF = _mm_blendv_ps(vfCDEF, vone, _mm_cmpgt_ps(vxCDEF, vone_cutoff));
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
-    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
-    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
-    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
-
     _mm_storeu_ps(y, vf0123);
     _mm_storeu_ps(y + 4, vf4567);
     _mm_storeu_ps(y + 8, vf89AB);
@@ -172,7 +163,7 @@
     y += 16;
   }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -181,7 +172,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -189,57 +180,53 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
 
-    _mm_storeu_ps(y, vf0123);
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    _mm_storeu_ps(y, vf);
 
     x += 4;
     y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -248,7 +235,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -256,57 +243,53 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
 
     if (n & (2 * sizeof(float))) {
-      _mm_storel_pi((__m64*) y, vf0123);
-      vf0123 = _mm_movehl_ps(vf0123, vf0123);
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
       y += 2;
     }
     if (n & (1 * sizeof(float))) {
-      _mm_store_ss(y, vf0123);
+      _mm_store_ss(y, vf);
     }
   }
 }

diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x20.c b/src/f32-sigmoid/gen/sse41-p5-div-x20.c
new file mode 100644
index 0000000..a58a9cd
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x20.c

@@ -0,0 +1,313 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse41_p5_div_x20(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 20 * sizeof(float); n -= 20 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    const __m128 vx89AB = _mm_loadu_ps(x + 8);
+    const __m128 vxCDEF = _mm_loadu_ps(x + 12);
+    const __m128 vxGHIJ = _mm_loadu_ps(x + 16);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
+    const __m128 vz89AB = _mm_or_ps(vx89AB, vsign_mask);
+    const __m128 vzCDEF = _mm_or_ps(vxCDEF, vsign_mask);
+    const __m128 vzGHIJ = _mm_or_ps(vxGHIJ, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vzCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vzGHIJ, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vzCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vzGHIJ);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 ve4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 ve89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 veCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 veGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd4567 = _mm_add_ps(ve4567, vone);
+    __m128 vd89AB = _mm_add_ps(ve89AB, vone);
+    __m128 vdCDEF = _mm_add_ps(veCDEF, vone);
+    __m128 vdGHIJ = _mm_add_ps(veGHIJ, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+    __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
+    __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
+    __m128 vfCDEF = _mm_div_ps(veCDEF, vdCDEF);
+    __m128 vfGHIJ = _mm_div_ps(veGHIJ, vdGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vzCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vzGHIJ, vdenorm_cutoff), vfGHIJ);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
+    vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);
+    vf89AB = _mm_blendv_ps(_mm_sub_ps(vone, vf89AB), vf89AB, vx89AB);
+    vfCDEF = _mm_blendv_ps(_mm_sub_ps(vone, vfCDEF), vfCDEF, vxCDEF);
+    vfGHIJ = _mm_blendv_ps(_mm_sub_ps(vone, vfGHIJ), vfGHIJ, vxGHIJ);
+
+    _mm_storeu_ps(y, vf0123);
+    _mm_storeu_ps(y + 4, vf4567);
+    _mm_storeu_ps(y + 8, vf89AB);
+    _mm_storeu_ps(y + 12, vfCDEF);
+    _mm_storeu_ps(y + 16, vfGHIJ);
+
+    x += 20;
+    y += 20;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    if (n & (2 * sizeof(float))) {
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x24.c b/src/f32-sigmoid/gen/sse41-p5-div-x24.c
new file mode 100644
index 0000000..71979d3
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x24.c

@@ -0,0 +1,331 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse41_p5_div_x24(  // Computes y[i] = sigmoid(x[i]), 24 floats per main-loop iteration.
+    size_t n,  // Size of the input in bytes; asserted below to be a multiple of sizeof(float).
+    const float* x,  // Input elements, read with unaligned loads.
+    float* y,  // Output elements, written with unaligned stores.
+    const void* params)  // Not referenced anywhere in this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);  // Only the sign bit is set; OR-ing it into x yields -abs(x).
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
+    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx4567 = _mm_loadu_ps(x + 4);
+    const __m128 vx89AB = _mm_loadu_ps(x + 8);
+    const __m128 vxCDEF = _mm_loadu_ps(x + 12);
+    const __m128 vxGHIJ = _mm_loadu_ps(x + 16);
+    const __m128 vxKLMN = _mm_loadu_ps(x + 20);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
+    const __m128 vz89AB = _mm_or_ps(vx89AB, vsign_mask);
+    const __m128 vzCDEF = _mm_or_ps(vxCDEF, vsign_mask);
+    const __m128 vzGHIJ = _mm_or_ps(vxGHIJ, vsign_mask);
+    const __m128 vzKLMN = _mm_or_ps(vxKLMN, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vzCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vzGHIJ, vlog2e), vmagic_bias);
+    __m128 vnKLMN = _mm_add_ps(_mm_mul_ps(vzKLMN, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+    const __m128 vsKLMN = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnKLMN), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+    vnKLMN = _mm_sub_ps(vnKLMN, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vzCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vzGHIJ);
+    __m128 vtKLMN = _mm_add_ps(_mm_mul_ps(vnKLMN, vminus_ln2_hi), vzKLMN);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+    vtKLMN = _mm_add_ps(_mm_mul_ps(vnKLMN, vminus_ln2_lo), vtKLMN);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+    __m128 vpKLMN = _mm_add_ps(_mm_mul_ps(vc5, vtKLMN), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+    vpKLMN = _mm_add_ps(_mm_mul_ps(vpKLMN, vtKLMN), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+    vpKLMN = _mm_add_ps(_mm_mul_ps(vpKLMN, vtKLMN), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+    vpKLMN = _mm_add_ps(_mm_mul_ps(vpKLMN, vtKLMN), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+    vtKLMN = _mm_mul_ps(vtKLMN, vsKLMN);
+
+    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 ve4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 ve89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 veCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 veGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+    __m128 veKLMN = _mm_add_ps(_mm_mul_ps(vtKLMN, vpKLMN), vsKLMN);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd4567 = _mm_add_ps(ve4567, vone);
+    __m128 vd89AB = _mm_add_ps(ve89AB, vone);
+    __m128 vdCDEF = _mm_add_ps(veCDEF, vone);
+    __m128 vdGHIJ = _mm_add_ps(veGHIJ, vone);
+    __m128 vdKLMN = _mm_add_ps(veKLMN, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
+    __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
+    __m128 vf89AB = _mm_div_ps(ve89AB, vd89AB);
+    __m128 vfCDEF = _mm_div_ps(veCDEF, vdCDEF);
+    __m128 vfGHIJ = _mm_div_ps(veGHIJ, vdGHIJ);
+    __m128 vfKLMN = _mm_div_ps(veKLMN, vdKLMN);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vz89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vzCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vzGHIJ, vdenorm_cutoff), vfGHIJ);
+    vfKLMN = _mm_andnot_ps(_mm_cmplt_ps(vzKLMN, vdenorm_cutoff), vfKLMN);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); the blend selects on the sign bit of x.
+    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
+    vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);
+    vf89AB = _mm_blendv_ps(_mm_sub_ps(vone, vf89AB), vf89AB, vx89AB);
+    vfCDEF = _mm_blendv_ps(_mm_sub_ps(vone, vfCDEF), vfCDEF, vxCDEF);
+    vfGHIJ = _mm_blendv_ps(_mm_sub_ps(vone, vfGHIJ), vfGHIJ, vxGHIJ);
+    vfKLMN = _mm_blendv_ps(_mm_sub_ps(vone, vfKLMN), vfKLMN, vxKLMN);
+
+    _mm_storeu_ps(y, vf0123);
+    _mm_storeu_ps(y + 4, vf4567);
+    _mm_storeu_ps(y + 8, vf89AB);
+    _mm_storeu_ps(y + 12, vfCDEF);
+    _mm_storeu_ps(y + 16, vfGHIJ);
+    _mm_storeu_ps(y + 20, vfKLMN);
+
+    x += 24;
+    y += 24;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); the blend selects on the sign bit of x.
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);  // Full 4-float load with only 1-3 elements left. NOTE(review): reads past them; confirm padding.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); the blend selects on the sign bit of x.
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    if (n & (2 * sizeof(float))) {  // Two or three elements remain: store the two low lanes.
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);  // Move the two high lanes into the low half for the final store.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // An odd element remains: store the lowest lane only.
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x4.c b/src/f32-sigmoid/gen/sse41-p5-div-x4.c
new file mode 100644
index 0000000..54b600a
--- /dev/null
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x4.c

@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-sigmoid/sse-p5-div.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__sse41_p5_div_x4(  // Computes y[i] = sigmoid(x[i]), 4 floats per main-loop iteration.
+    size_t n,  // Size of the input in bytes; asserted below to be a multiple of sizeof(float).
+    const float* x,  // Input elements, read with unaligned loads.
+    float* y,  // Output elements, written with unaligned stores.
+    const void* params)  // Not referenced anywhere in this kernel.
+{
+  assert(n % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which sigmoidf(x) is normalized.
+  // This number is also the smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+  const __m128 vone = _mm_set1_ps(1.0f);
+  const __m128 vsign_mask = _mm_set1_ps(-0.0f);  // Only the sign bit is set; OR-ing it into x yields -abs(x).
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); the blend selects on the sign bit of x.
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);  // Full 4-float load with only 1-3 elements left. NOTE(review): reads past them; confirm padding.
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [-87.336544, 0]) underflow or saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z); the blend selects on the sign bit of x.
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    if (n & (2 * sizeof(float))) {  // Two or three elements remain: store the two low lanes.
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);  // Move the two high lanes into the low half for the final store.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // An odd element remains: store the lowest lane only.
+      _mm_store_ss(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/gen/sse41-p5-div-x8.c b/src/f32-sigmoid/gen/sse41-p5-div-x8.c
index a2c16a5..6a9e9db 100644
--- a/src/f32-sigmoid/gen/sse41-p5-div-x8.c
+++ b/src/f32-sigmoid/gen/sse41-p5-div-x8.c

@@ -27,10 +27,8 @@
   // The smallest x for which sigmoidf(x) is normalized.
   // This number is also the smallest x for which expf(x) is normalized.
   const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
   const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
-  // Last 8 bits are zeroes
+  // Last 7 bits are zeroes
   const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
   const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
   const __m128 vone = _mm_set1_ps(1.0f);
@@ -113,20 +111,15 @@
     __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
     __m128 vf4567 = _mm_div_ps(ve4567, vd4567);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
     vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
     vf4567 = _mm_blendv_ps(_mm_sub_ps(vone, vf4567), vf4567, vx4567);
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
-    vf4567 = _mm_blendv_ps(vf4567, vone, _mm_cmpgt_ps(vx4567, vone_cutoff));
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
-    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
-
     _mm_storeu_ps(y, vf0123);
     _mm_storeu_ps(y + 4, vf4567);
 
@@ -134,7 +127,7 @@
     y += 8;
   }
   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -143,7 +136,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -151,57 +144,53 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
 
-    _mm_storeu_ps(y, vf0123);
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+
+    _mm_storeu_ps(y, vf);
 
     x += 4;
     y += 4;
   }
   if XNN_UNLIKELY(n != 0) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -210,7 +199,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -218,57 +207,53 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
 
     if (n & (2 * sizeof(float))) {
-      _mm_storel_pi((__m64*) y, vf0123);
-      vf0123 = _mm_movehl_ps(vf0123, vf0123);
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
       y += 2;
     }
     if (n & (1 * sizeof(float))) {
-      _mm_store_ss(y, vf0123);
+      _mm_store_ss(y, vf);
     }
   }
 }

diff --git a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in b/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in
index 434f86e..7e66577 100644
--- a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in
+++ b/src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in

@@ -5,7 +5,7 @@
 
 $assert BATCH_TILE % 4 == 0
 $assert BATCH_TILE >= 4
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 
 #include <arm_neon.h>

diff --git a/src/f32-sigmoid/neon-lut2048-p1.c.in b/src/f32-sigmoid/neon-lut2048-p1.c.in
new file mode 100644
index 0000000..58afd54
--- /dev/null
+++ b/src/f32-sigmoid/neon-lut2048-p1.c.in

@@ -0,0 +1,379 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$assert DIV_ALGO in ["div", "nr2fma", "nr2recps", "nr1recps1fma"]
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+extern XNN_INTERNAL const float xnn_table_exp2_k_over_2048[2048];
+
+void xnn_f32_sigmoid_ukernel__${"neonfma" if FMA else "neon"}_lut2048_p1_${DIV_ALGO}_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  $if FMA:
+    const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+    const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
+  $else:
+    // Last 18 bits are zeroes
+    const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.600000p-12f);
+    const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7217F8p-19f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  $if BATCH_TILE > 4:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
+
+      // General structure of the algorithm:
+      //           / exp(x) / (1 + exp(x)) if x <= 0
+      //   f[x] :=
+      //           \ 1 - f[-x] if x >= 0
+      //
+      // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+      // then replace result with 1 - f[-z] if x >= 0.
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vz${ABC[N:N+4]} = vabsq_f32(vx${ABC[N:N+4]});
+
+      // Compute reduced argument n := round(-z * 2048 / log(2)).
+      // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+      // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+      // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+      // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+      // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+      // for such inputs at the very end of the algorithm.
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vn${ABC[N:N+4]} = ${VMULADDQ_F32}(vmagic_bias, vz${ABC[N:N+4]}, vminus_log2e_x2048);
+
+      // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+      // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+      // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+      // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+      //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+      // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+      //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+      //    and thus the adjusted exponent is not lower than -126.
+      //
+      // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+      $for N in range(0, BATCH_TILE, 4):
+        const int32x4_t ve${ABC[N:N+4]} = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn${ABC[N:N+4]}), vindex_mask), 12);
+
+      // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+      $for N in range(0, BATCH_TILE, 4):
+        const uint64x2_t vidx${ABC[N:N+4]} = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn${ABC[N:N+4]}), vindex_mask));
+
+      $for N in range(0, BATCH_TILE, 4):
+        const uint64_t vidx${ABC[N:N+2]} = vgetq_lane_u64(vidx${ABC[N:N+4]}, 0);
+        const uint64_t vidx${ABC[N+2:N+4]} = vgetq_lane_u64(vidx${ABC[N:N+4]}, 1);
+        float32x2_t vl${ABC[N:N+2]} = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx${ABC[N:N+2]}]);
+        float32x2_t vl${ABC[N+2:N+4]} = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx${ABC[N+2:N+4]}]);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vl${ABC[N:N+2]} = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx${ABC[N:N+2]} >> 32)], vl${ABC[N:N+2]}, 1);
+        vl${ABC[N+2:N+4]} = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx${ABC[N+2:N+4]} >> 32)], vl${ABC[N+2:N+4]}, 1);
+        const float32x4_t vl${ABC[N:N+4]} = vcombine_f32(vl${ABC[N:N+2]}, vl${ABC[N+2:N+4]});
+
+      // Adjust exponent of the value l fetched from the table to get the final s value.
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vs${ABC[N:N+4]} = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl${ABC[N:N+4]}), ve${ABC[N:N+4]}));
+
+      // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+      $for N in range(0, BATCH_TILE, 4):
+        vn${ABC[N:N+4]} = vsubq_f32(vn${ABC[N:N+4]}, vmagic_bias);
+
+      // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+      // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vz${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_o2048_hi);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_o2048_lo);
+
+      // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+      //   P1(t) = 1 + t * c1
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vp${ABC[N:N+4]} = vmulq_f32(vt${ABC[N:N+4]}, vc1);
+
+      // Reconstruct the exp(-z) value:
+      //   y = s * (1 + t * c1)
+      //     = s + s * (t * c1))
+      //     = s + s * p
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vy${ABC[N:N+4]} = ${VMULADDQ_F32}(vs${ABC[N:N+4]}, vs${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+      // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vd${ABC[N:N+4]} = vaddq_f32(vy${ABC[N:N+4]}, vone);
+
+      $if DIV_ALGO == "div":
+        // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+        $for N in range(0, BATCH_TILE, 4):
+          float32x4_t vf${ABC[N:N+4]} = vdivq_f32(vy${ABC[N:N+4]}, vd${ABC[N:N+4]});
+      $else:
+        // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+        // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+        // Thus the reciprocal of the denominator never overflows.
+        $for N in range(0, BATCH_TILE, 4):
+          float32x4_t vr${ABC[N:N+4]} = vrecpeq_f32(vd${ABC[N:N+4]});
+
+        $if DIV_ALGO == "nr2fma":
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+        $else:
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vmulq_f32(vr${ABC[N:N+4]}, vrecpsq_f32(vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+
+        $if DIV_ALGO == "nr2recps":
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vmulq_f32(vr${ABC[N:N+4]}, vrecpsq_f32(vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+        $else:
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+
+        // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+        $for N in range(0, BATCH_TILE, 4):
+          float32x4_t vf${ABC[N:N+4]} = vmulq_f32(vy${ABC[N:N+4]}, vr${ABC[N:N+4]});
+
+      // For inputs with |x| above the denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf${ABC[N:N+4]}), vcagtq_f32(vx${ABC[N:N+4]}, vdenorm_cutoff)));
+
+      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is detected from the sign bit of x.
+      $for N in range(0, BATCH_TILE, 4):
+        const uint32x4_t vm${ABC[N:N+4]} = vcltq_s32(vreinterpretq_s32_f32(vx${ABC[N:N+4]}), vmovq_n_s32(0));
+
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = vbslq_f32(vm${ABC[N:N+4]}, vf${ABC[N:N+4]}, vsubq_f32(vone, vf${ABC[N:N+4]}));
+
+      $for N in range(0, BATCH_TILE, 4):
+        vst1q_f32(y, vf${ABC[N:N+4]}); y += 4;
+    }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_o2048_hi);
+    vt = ${VMULADDQ_F32}(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = ${VMULADDQ_F32}(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    $if DIV_ALGO == "div":
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vdivq_f32(vy, vd);
+    $else:
+      // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+      // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+      // Thus the reciprocal of the denominator never overflows.
+      float32x4_t vr = vrecpeq_f32(vd);
+
+      $if DIV_ALGO == "nr2fma":
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+      $else:
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+      $if DIV_ALGO == "nr2recps":
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+      $else:
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is detected from the sign bit of x.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|z * 2048 / log(2)| <= 2**22, i.e.
+    // |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable, because inputs x outside of
+    // [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x). We fixup the result
+    // for such inputs at the very end of the algorithm.
+    float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**(n % 2048) from the table using the 11 low bits of n, as integer. Note that the
+    //    fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vindex_mask), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**(n % 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx_lo = vgetq_lane_u64(vidx, 0);
+    const uint64_t vidx_hi = vgetq_lane_u64(vidx, 1);
+    float32x2_t vl_lo = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_lo]);
+    float32x2_t vl_hi = vld1_dup_f32(&xnn_table_exp2_k_over_2048[(uint32_t) vidx_hi]);
+    vl_lo = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_lo >> 32)], vl_lo, 1);
+    vl_hi = vld1_lane_f32(&xnn_table_exp2_k_over_2048[(uint32_t) (vidx_hi >> 32)], vl_hi, 1);
+    const float32x4_t vl = vcombine_f32(vl_lo, vl_hi);
+    // Adjust exponent of the value l fetched from the table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_o2048_hi);
+    vt = ${VMULADDQ_F32}(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1))
+    //     = s + s * p
+    const float32x4_t vy = ${VMULADDQ_F32}(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    $if DIV_ALGO == "div":
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vdivq_f32(vy, vd);
+    $else:
+      // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+      // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+      // Thus the reciprocal of the denominator never overflows.
+      float32x4_t vr = vrecpeq_f32(vd);
+
+      $if DIV_ALGO == "nr2fma":
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+      $else:
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+      $if DIV_ALGO == "nr2recps":
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+      $else:
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is detected from the sign bit of x.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    // Store the remaining 1-3 elements: first a pair (if any), then a single element (if any).
+    float32x2_t vf_lo = vget_low_f32(vf);
+    if (n & (2 * sizeof(float))) {
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);
+    }
+    if (n & (1 * sizeof(float))) {
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/neon-p5.c.in b/src/f32-sigmoid/neon-p5.c.in
new file mode 100644
index 0000000..5373463
--- /dev/null
+++ b/src/f32-sigmoid/neon-p5.c.in

@@ -0,0 +1,326 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$assert DIV_ALGO in ["div", "nr2fma", "nr2recps", "nr1recps1fma"]
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__${"neonfma" if FMA else "neon"}_p5_${DIV_ALGO}_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  $if FMA:
+    const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
+    const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
+  $else:
+    // Last 7 bits are zeroes
+    const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+    const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  $if BATCH_TILE > 4:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
+
+      // General structure of the algorithm:
+      //           / exp(x) / (1 + exp(x)) if x <= 0
+      //   f[x] := 
+      //           \ 1 - f[-x] if x >= 0
+      //
+      // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+      // then replace result with 1 - f[z] if x >= 0.
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vz${ABC[N:N+4]} = vabsq_f32(vx${ABC[N:N+4]});
+
+      // Compute reduced argument n := round(-z / log(2)).
+      // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+      // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+      // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+      // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+      // anyway. We fixup the result for such inputs at the very end of the algorithm.
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vn${ABC[N:N+4]} = ${VMULADDQ_F32}(vmagic_bias, vz${ABC[N:N+4]}, vminus_log2e);
+
+      // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+      // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+      $for N in range(0, BATCH_TILE, 4):
+        const float32x4_t vs${ABC[N:N+4]} = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn${ABC[N:N+4]}), 23));
+
+      // Subtract the large number back to get final n := round(-z / log(2)).
+      $for N in range(0, BATCH_TILE, 4):
+        vn${ABC[N:N+4]} = vsubq_f32(vn${ABC[N:N+4]}, vmagic_bias);
+
+      // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+      // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vz${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_hi);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = ${VMULADDQ_F32}(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_lo);
+
+      // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vp${ABC[N:N+4]} = ${VMULADDQ_F32}(vc4, vc5, vt${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = ${VMULADDQ_F32}(vc3, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = ${VMULADDQ_F32}(vc2, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = ${VMULADDQ_F32}(vc1, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+      // Reconstruct the exp(-z) value:
+      //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+      //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+      //     = s + (t * s) * p
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = vmulq_f32(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t ve${ABC[N:N+4]} = ${VMULADDQ_F32}(vs${ABC[N:N+4]}, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+      // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+      $for N in range(0, BATCH_TILE, 4):
+        float32x4_t vd${ABC[N:N+4]} = vaddq_f32(ve${ABC[N:N+4]}, vone);
+
+      $if DIV_ALGO == "div":
+        // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+        $for N in range(0, BATCH_TILE, 4):
+          float32x4_t vf${ABC[N:N+4]} = vdivq_f32(ve${ABC[N:N+4]}, vd${ABC[N:N+4]});
+      $else:
+        // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+        // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+        // Thus the reciprocal of the denominator never overflows.
+        $for N in range(0, BATCH_TILE, 4):
+          float32x4_t vr${ABC[N:N+4]} = vrecpeq_f32(vd${ABC[N:N+4]});
+
+        $if DIV_ALGO == "nr2fma":
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+        $else:
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vmulq_f32(vr${ABC[N:N+4]}, vrecpsq_f32(vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+
+        $if DIV_ALGO == "nr2recps":
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vmulq_f32(vr${ABC[N:N+4]}, vrecpsq_f32(vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+        $else:
+          $for N in range(0, BATCH_TILE, 4):
+            vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
+
+        // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+        $for N in range(0, BATCH_TILE, 4):
+          float32x4_t vf${ABC[N:N+4]} = vmulq_f32(ve${ABC[N:N+4]}, vr${ABC[N:N+4]});
+
+      // For inputs below denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf${ABC[N:N+4]}), vcagtq_f32(vx${ABC[N:N+4]}, vdenorm_cutoff)));
+
+      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+      $for N in range(0, BATCH_TILE, 4):
+        const uint32x4_t vm${ABC[N:N+4]} = vcltq_f32(vx${ABC[N:N+4]}, vmovq_n_f32(0.0f));
+
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = vbslq_f32(vm${ABC[N:N+4]}, vf${ABC[N:N+4]}, vsubq_f32(vone, vf${ABC[N:N+4]}));
+
+      $for N in range(0, BATCH_TILE, 4):
+        vst1q_f32(y, vf${ABC[N:N+4]}); y += 4;
+    }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(x); x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_hi);
+    vt = ${VMULADDQ_F32}(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = ${VMULADDQ_F32}(vc4, vc5, vt);
+    vp = ${VMULADDQ_F32}(vc3, vp, vt);
+    vp = ${VMULADDQ_F32}(vc2, vp, vt);
+    vp = ${VMULADDQ_F32}(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = ${VMULADDQ_F32}(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    $if DIV_ALGO == "div":
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vdivq_f32(ve, vd);
+    $else:
+      // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+      // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+      // Thus the reciprocal of the denominator never overflows.
+      float32x4_t vr = vrecpeq_f32(vd);
+
+      $if DIV_ALGO == "nr2fma":
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+      $else:
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+      $if DIV_ALGO == "nr2recps":
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+      $else:
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff (absolute compare), replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); the mask is a float compare against 0.0f.
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(y, vf); y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const float32x4_t vx = vld1q_f32(x);  // NOTE(review): loads a full 4-lane vector although fewer than 4 elements remain; presumably callers permit the over-read - confirm
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = ${VMULADDQ_F32}(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get final n := round(-z / log(2)).
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = ${VMULADDQ_F32}(vz, vn, vln2_hi);
+    vt = ${VMULADDQ_F32}(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
+    float32x4_t vp = ${VMULADDQ_F32}(vc4, vc5, vt);
+    vp = ${VMULADDQ_F32}(vc3, vp, vt);
+    vp = ${VMULADDQ_F32}(vc2, vp, vt);
+    vp = ${VMULADDQ_F32}(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = ${VMULADDQ_F32}(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    $if DIV_ALGO == "div":
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vdivq_f32(ve, vd);
+    $else:
+      // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+      // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+      // Thus the reciprocal of the denominator never overflows.
+      float32x4_t vr = vrecpeq_f32(vd);
+
+      $if DIV_ALGO == "nr2fma":
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+      $else:
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+      $if DIV_ALGO == "nr2recps":
+        vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+      $else:
+        vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
+
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs whose magnitude exceeds the denormal cutoff (absolute compare), replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    const uint32x4_t vm = vcltq_f32(vx, vmovq_n_f32(0.0f));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    float32x2_t vf_lo = vget_low_f32(vf);  // lanes 0-1 of the result
+    if (n & (2 * sizeof(float))) {  // at least two elements remain: store lanes 0-1
+      vst1_f32(y, vf_lo); y += 2;
+      vf_lo = vget_high_f32(vf);  // continue with lanes 2-3
+    }
+    if (n & (1 * sizeof(float))) {  // one trailing element: store a single lane
+      vst1_lane_f32(y, vf_lo, 0);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/neonfma-p5-nr2fma.c.in b/src/f32-sigmoid/neonfma-p5-nr2fma.c.in
deleted file mode 100644
index fe952bf..0000000
--- a/src/f32-sigmoid/neonfma-p5-nr2fma.c.in
+++ /dev/null

@@ -1,296 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert BATCH_TILE % 4 == 0
-$assert BATCH_TILE >= 4
-$ABC = "0123456789ABCDEFGHIJKLMN"
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/vunary.h>
-
-
-void xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x${BATCH_TILE}(
-    size_t n,  // input size in bytes (asserted to be a multiple of sizeof(float))
-    const float* x,  // input elements
-    float* y,  // output elements, written in the same order as x is read
-    const void* params)  // not referenced in this kernel
-{
-  assert(n % sizeof(float) == 0);
-
-  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal to 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
-  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
-  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
-  const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
-  const float32x4_t vone = vmovq_n_f32(1.0f);
-
-  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
-  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
-  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
-  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
-  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
-
-  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
-    $for N in range(0, BATCH_TILE, 4):
-      const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
-
-    // General structure of the algorithm:
-    //           / exp(x) / (1 + exp(x)) if x <= 0
-    //   f[x] :=
-    //           \ 1 - f[-x] if x >= 0
-    //
-    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
-    // then replace result with 1 - f[-z] if x >= 0.
-    $for N in range(0, BATCH_TILE, 4):
-      const float32x4_t vz${ABC[N:N+4]} = vabsq_f32(vx${ABC[N:N+4]});
-
-    // Compute reduced argument n := round(-z / log(2)).
-    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
-    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
-    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
-    // anyway. We fixup the result for such inputs at the very end of the algorithm.
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t vn${ABC[N:N+4]} = vfmaq_f32(vmagic_bias, vz${ABC[N:N+4]}, vminus_log2e);
-
-    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
-    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
-    $for N in range(0, BATCH_TILE, 4):
-      const float32x4_t vs${ABC[N:N+4]} = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn${ABC[N:N+4]}), 23));
-
-    // Subtract the large number back to get final n := round(-z / log(2)).
-    $for N in range(0, BATCH_TILE, 4):
-      vn${ABC[N:N+4]} = vsubq_f32(vn${ABC[N:N+4]}, vmagic_bias);
-
-    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
-    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t vt${ABC[N:N+4]} = vfmaq_f32(vz${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_hi);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vt${ABC[N:N+4]} = vfmaq_f32(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_lo);
-
-    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t vp${ABC[N:N+4]} = vfmaq_f32(vc4, vc5, vt${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      vp${ABC[N:N+4]} = vfmaq_f32(vc3, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      vp${ABC[N:N+4]} = vfmaq_f32(vc2, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      vp${ABC[N:N+4]} = vfmaq_f32(vc1, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
-    // Reconstruct the exp(-z) value:
-    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
-    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
-    //     = s + (t * s) * p
-    $for N in range(0, BATCH_TILE, 4):
-      vt${ABC[N:N+4]} = vmulq_f32(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t ve${ABC[N:N+4]} = vfmaq_f32(vs${ABC[N:N+4]}, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
-    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t vd${ABC[N:N+4]} = vaddq_f32(ve${ABC[N:N+4]}, vone);
-
-    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
-    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
-    // Thus the reciprocal of the denominator never overflows.
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t vr${ABC[N:N+4]} = vrecpeq_f32(vd${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
-
-    $for N in range(0, BATCH_TILE, 4):
-      vr${ABC[N:N+4]} = vfmaq_f32(vr${ABC[N:N+4]}, vr${ABC[N:N+4]}, vfmsq_f32(vone, vr${ABC[N:N+4]}, vd${ABC[N:N+4]}));
-
-    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
-    $for N in range(0, BATCH_TILE, 4):
-      float32x4_t vf${ABC[N:N+4]} = vmulq_f32(ve${ABC[N:N+4]}, vr${ABC[N:N+4]});
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
-    // The mask is a signed-integer compare of the raw float bits against 0, i.e. it is set when the sign bit of x is set.
-    $for N in range(0, BATCH_TILE, 4):
-      const uint32x4_t vm${ABC[N:N+4]} = vcltq_s32(vreinterpretq_s32_f32(vx${ABC[N:N+4]}), vmovq_n_s32(0));
-
-    $for N in range(0, BATCH_TILE, 4):
-      vf${ABC[N:N+4]} = vbslq_f32(vm${ABC[N:N+4]}, vf${ABC[N:N+4]}, vsubq_f32(vone, vf${ABC[N:N+4]}));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    $for N in range(0, BATCH_TILE, 4):
-      vf${ABC[N:N+4]} = vbslq_f32(vcgtq_f32(vx${ABC[N:N+4]}, vone_cutoff), vone, vf${ABC[N:N+4]});
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    $for N in range(0, BATCH_TILE, 4):
-      vf${ABC[N:N+4]} = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf${ABC[N:N+4]}), vcltq_f32(vx${ABC[N:N+4]}, vdenorm_cutoff)));
-
-    $for N in range(0, BATCH_TILE, 4):
-      vst1q_f32(y, vf${ABC[N:N+4]}); y += 4;
-  }
-  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-    const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-
-    // General structure of the algorithm:
-    //           / exp(x) / (1 + exp(x)) if x <= 0
-    //   f[x] :=
-    //           \ 1 - f[-x] if x >= 0
-    //
-    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
-    // then replace result with 1 - f[-z] if x >= 0.
-    const float32x4_t vz0123 = vabsq_f32(vx0123);
-
-    // Compute reduced argument n := round(-z / log(2)).
-    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
-    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
-    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
-    // anyway. We fixup the result for such inputs at the very end of the algorithm.
-    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
-
-    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
-    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
-    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
-
-    // Subtract the large number back to get final n := round(-z / log(2)).
-    vn0123 = vsubq_f32(vn0123, vmagic_bias);
-
-    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
-    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
-    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
-
-    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
-    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
-    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
-
-    // Reconstruct the exp(-z) value:
-    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
-    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
-    //     = s + (t * s) * p
-    vt0123 = vmulq_f32(vt0123, vs0123);
-    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
-
-    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
-    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
-
-    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
-    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
-    // Thus the reciprocal of the denominator never overflows.
-    float32x4_t vr0123 = vrecpeq_f32(vd0123);
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
-
-    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
-    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
-    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
-    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vbslq_f32(vcgtq_f32(vx0123, vone_cutoff), vone, vf0123);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff)));
-
-    vst1q_f32(y, vf0123); y += 4;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const float32x4_t vx0123 = vld1q_f32(x);  // NOTE(review): loads a full 4-lane vector although fewer than 4 elements remain; presumably callers permit the over-read - confirm
-
-    // General structure of the algorithm:
-    //           / exp(x) / (1 + exp(x)) if x <= 0
-    //   f[x] :=
-    //           \ 1 - f[-x] if x >= 0
-    //
-    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
-    // then replace result with 1 - f[-z] if x >= 0.
-    const float32x4_t vz0123 = vabsq_f32(vx0123);
-
-    // Compute reduced argument n := round(-z / log(2)).
-    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
-    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
-    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
-    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
-    // anyway. We fixup the result for such inputs at the very end of the algorithm.
-    float32x4_t vn0123 = vfmaq_f32(vmagic_bias, vz0123, vminus_log2e);
-
-    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
-    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
-    const float32x4_t vs0123 = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn0123), 23));
-
-    // Subtract the large number back to get final n := round(-z / log(2)).
-    vn0123 = vsubq_f32(vn0123, vmagic_bias);
-
-    // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
-    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    float32x4_t vt0123 = vfmaq_f32(vz0123, vn0123, vln2_hi);
-    vt0123 = vfmaq_f32(vt0123, vn0123, vln2_lo);
-
-    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2].
-    float32x4_t vp0123 = vfmaq_f32(vc4, vc5, vt0123);
-    vp0123 = vfmaq_f32(vc3, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc2, vp0123, vt0123);
-    vp0123 = vfmaq_f32(vc1, vp0123, vt0123);
-
-    // Reconstruct the exp(-z) value:
-    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
-    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
-    //     = s + (t * s) * p
-    vt0123 = vmulq_f32(vt0123, vs0123);
-    float32x4_t ve0123 = vfmaq_f32(vs0123, vp0123, vt0123);
-
-    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
-    float32x4_t vd0123 = vaddq_f32(ve0123, vone);
-
-    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
-    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
-    // Thus the reciprocal of the denominator never overflows.
-    float32x4_t vr0123 = vrecpeq_f32(vd0123);
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
-    vr0123 = vfmaq_f32(vr0123, vr0123, vfmsq_f32(vone, vr0123, vd0123));
-
-    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
-    float32x4_t vf0123 = vmulq_f32(ve0123, vr0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
-    const uint32x4_t vm0123 = vcltq_s32(vreinterpretq_s32_f32(vx0123), vmovq_n_s32(0));
-    vf0123 = vbslq_f32(vm0123, vf0123, vsubq_f32(vone, vf0123));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vbslq_f32(vcgtq_f32(vx0123, vone_cutoff), vone, vf0123);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf0123), vcltq_f32(vx0123, vdenorm_cutoff)));
-
-    float32x2_t vf01 = vget_low_f32(vf0123);  // lanes 0-1 of the result
-    if (n & (2 * sizeof(float))) {  // at least two elements remain: store lanes 0-1
-      vst1_f32(y, vf01); y += 2;
-      vf01 = vget_high_f32(vf0123);  // continue with lanes 2-3
-    }
-    if (n & (1 * sizeof(float))) {  // one trailing element: store a single lane
-      vst1_lane_f32(y, vf01, 0);
-    }
-  }
-}

diff --git a/src/f32-sigmoid/psimd-p5-div.c.in b/src/f32-sigmoid/psimd-p5-div.c.in
new file mode 100644
index 0000000..2a02afa
--- /dev/null
+++ b/src/f32-sigmoid/psimd-p5-div.c.in

@@ -0,0 +1,251 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 4 == 0
+$assert BATCH_TILE >= 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f32_sigmoid_ukernel__psimd_p5_div_x${BATCH_TILE}(
+    size_t n,
+    const float* x,
+    float* y,
+    const void* params)
+{
+  assert(n % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  $if BATCH_TILE > 4:
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      const psimd_f32 vx${ABC[0:4]} = psimd_load_f32(x);
+      $for N in range(4, BATCH_TILE, 4):
+        const psimd_f32 vx${ABC[N:N+4]} = psimd_load_f32(x + ${N});
+      x += ${BATCH_TILE};
+
+      // General structure of the algorithm:
+      //           / exp(x) / (1 + exp(x)) if x <= 0
+      //   f[x] := 
+      //           \ 1 - f[-x] if x >= 0
+      //
+      // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+      // then replace result with 1 - f[-z] if x >= 0.
+      $for N in range(0, BATCH_TILE, 4):
+        const psimd_f32 vz${ABC[N:N+4]} = psimd_abs_f32(vx${ABC[N:N+4]});
+
+      // Compute reduced argument n := round(-z / log(2)).
+      // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+      // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+      // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+      // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+      // anyway. We fixup the result for such inputs at the very end of the algorithm.
+      $for N in range(0, BATCH_TILE, 4):
+        psimd_f32 vn${ABC[N:N+4]} = psimd_qfma_f32(vmagic_bias, vz${ABC[N:N+4]}, vminus_log2e);
+
+      // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+      // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+      $for N in range(0, BATCH_TILE, 4):
+        const psimd_f32 vs${ABC[N:N+4]} = (psimd_f32) ((psimd_u32) vn${ABC[N:N+4]} << 23);
+
+      // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+      $for N in range(0, BATCH_TILE, 4):
+        vn${ABC[N:N+4]} = psimd_sub_f32(vn${ABC[N:N+4]}, vmagic_bias);
+
+      // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+      // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+      $for N in range(0, BATCH_TILE, 4):
+        psimd_f32 vt${ABC[N:N+4]} = psimd_qfma_f32(vz${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_hi);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = psimd_qfma_f32(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vln2_lo);
+
+      // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+      //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+      $for N in range(0, BATCH_TILE, 4):
+        psimd_f32 vp${ABC[N:N+4]} = psimd_qfma_f32(vc4, vt${ABC[N:N+4]}, vc5);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = psimd_qfma_f32(vc3, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = psimd_qfma_f32(vc2, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = psimd_qfma_f32(vc1, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+      // Reconstruct the exp(-z) value:
+      //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+      //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+      //     = s + (t * s) * p
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = psimd_mul_f32(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        const psimd_f32 ve${ABC[N:N+4]} = psimd_qfma_f32(vs${ABC[N:N+4]}, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+      // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+      $for N in range(0, BATCH_TILE, 4):
+        psimd_f32 vf${ABC[N:N+4]} = psimd_div_f32(ve${ABC[N:N+4]}, psimd_add_f32(ve${ABC[N:N+4]}, vone));
+
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = psimd_andnotmask_f32(vz${ABC[N:N+4]} > vdenorm_cutoff, vf${ABC[N:N+4]});
+
+      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = psimd_signblend_f32(vx${ABC[N:N+4]}, vf${ABC[N:N+4]}, psimd_sub_f32(vone, vf${ABC[N:N+4]}));
+
+      psimd_store_f32(y, vf${ABC[0:4]});
+      $for N in range(4, BATCH_TILE, 4):
+        psimd_store_f32(y + ${N}, vf${ABC[N:N+4]});
+      y += ${BATCH_TILE};
+    }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    x += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting
+    // the large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(y, vf);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const psimd_f32 vx = psimd_load_f32(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's OK, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fix up the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs above denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    if (n & (2 * sizeof(float))) {
+      psimd_store2_f32(y, vf);
+      vf = psimd_concat_hi_f32(vf, vf);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      psimd_store1_f32(y, vf);
+    }
+  }
+}

diff --git a/src/f32-sigmoid/scalar-lut2048-p1-div.c.in b/src/f32-sigmoid/scalar-lut2048-p1-div.c.in
index f90b934..40a483c 100644
--- a/src/f32-sigmoid/scalar-lut2048-p1-div.c.in
+++ b/src/f32-sigmoid/scalar-lut2048-p1-div.c.in

@@ -4,7 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $assert BATCH_TILE >= 1
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 #include <math.h>
 
@@ -26,11 +26,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x2048 = -0x1.715476p11f;
   // Last 18 bits are zeroes
   const float vln2_o2048_hi = 0x1.600000p-12f;
@@ -115,26 +113,19 @@
       $for N in range(BATCH_TILE):
         float vf${N} = vy${N} / (vy${N} + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      $for N in range(BATCH_TILE):
+        if XNN_UNPREDICTABLE(vz${N} > vdenorm_cutoff) {
+          vf${N} = 0.0f;
+        }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       $for N in range(BATCH_TILE):
         if XNN_UNPREDICTABLE(vx${N} > 0.0f) {
           vf${N} = vone - vf${N};
         }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $for N in range(BATCH_TILE):
-        if XNN_UNPREDICTABLE(vx${N} > vone_cutoff) {
-          vf${N} = vone;
-        }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $for N in range(BATCH_TILE):
-        if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) {
-          vf${N} = 0.0f;
-        }
-
       $for N in range(BATCH_TILE):
         y[${N}] = vf${N};
       y += ${BATCH_TILE};
@@ -199,23 +190,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = vy / (vy + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y++ = vf;
 
       n -= sizeof(float);
@@ -280,23 +265,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = vy / (vy + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y = vf;
     }
   $else:
@@ -360,23 +339,17 @@
         // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
         float vf = vy / (vy + vone);
 
+        // For inputs above denormal cutoff, replace output with +0.0f.
+        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+        if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+          vf = 0.0f;
+        }
+
         // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
         if XNN_UNPREDICTABLE(vx > 0.0f) {
           vf = vone - vf;
         }
 
-        // For inputs above 1.0 cutoff, replace output with 1.0.
-        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-        if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-          vf = vone;
-        }
-
-        // For inputs below denormal cutoff, replace output with +0.0f.
-        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-        if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-          vf = 0.0f;
-        }
-
         *y++ = vf;
 
         n -= sizeof(float);

diff --git a/src/f32-sigmoid/scalar-lut64-p2-div.c.in b/src/f32-sigmoid/scalar-lut64-p2-div.c.in
index 44e2992..1198f9c 100644
--- a/src/f32-sigmoid/scalar-lut64-p2-div.c.in
+++ b/src/f32-sigmoid/scalar-lut64-p2-div.c.in

@@ -4,7 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $assert BATCH_TILE >= 1
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 #include <math.h>
 
@@ -26,11 +26,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x64 = -0x1.715476p6f;
   // Last 13 bits are zeroes
   const float vln2_o64_hi =  0x1.630000p-7f;
@@ -119,26 +117,19 @@
       $for N in range(BATCH_TILE):
         float vf${N} = vy${N} / (vy${N} + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      $for N in range(BATCH_TILE):
+        if XNN_UNPREDICTABLE(vz${N} > vdenorm_cutoff) {
+          vf${N} = 0.0f;
+        }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       $for N in range(BATCH_TILE):
         if XNN_UNPREDICTABLE(vx${N} > 0.0f) {
           vf${N} = vone - vf${N};
         }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $for N in range(BATCH_TILE):
-        if XNN_UNPREDICTABLE(vx${N} > vone_cutoff) {
-          vf${N} = vone;
-        }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $for N in range(BATCH_TILE):
-        if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) {
-          vf${N} = 0.0f;
-        }
-
       $for N in range(BATCH_TILE):
         y[${N}] = vf${N};
       y += ${BATCH_TILE};
@@ -205,23 +196,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = vy / (vy + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y++ = vf;
 
       n -= sizeof(float);
@@ -288,23 +273,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = vy / (vy + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y = vf;
     }
   $else:
@@ -370,23 +349,17 @@
         // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
         float vf = vy / (vy + vone);
 
+        // For inputs above denormal cutoff, replace output with +0.0f.
+        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+        if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+          vf = 0.0f;
+        }
+
         // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
         if XNN_UNPREDICTABLE(vx > 0.0f) {
           vf = vone - vf;
         }
 
-        // For inputs above 1.0 cutoff, replace output with 1.0.
-        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-        if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-          vf = vone;
-        }
-
-        // For inputs below denormal cutoff, replace output with +0.0f.
-        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-        if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-          vf = 0.0f;
-        }
-
         *y++ = vf;
 
         n -= sizeof(float);

diff --git a/src/f32-sigmoid/scalar-p5-div.c.in b/src/f32-sigmoid/scalar-p5-div.c.in
index 0d4e671..200cd27 100644
--- a/src/f32-sigmoid/scalar-p5-div.c.in
+++ b/src/f32-sigmoid/scalar-p5-div.c.in

@@ -4,7 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $assert BATCH_TILE >= 1
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 #include <math.h>
 
@@ -23,11 +23,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.8000FEp23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e = -0x1.715476p+0f;
   // Last 7 bits are zeroes
   const float vln2_hi = 0x1.62E400p-1f;
@@ -110,26 +108,19 @@
       $for N in range(BATCH_TILE):
         float vf${N} = ve${N} / (ve${N} + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      $for N in range(BATCH_TILE):
+        if XNN_UNPREDICTABLE(vz${N} > vdenorm_cutoff) {
+          vf${N} = 0.0f;
+        }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       $for N in range(BATCH_TILE):
         if XNN_UNPREDICTABLE(vx${N} > 0.0f) {
           vf${N} = vone - vf${N};
         }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $for N in range(BATCH_TILE):
-        if XNN_UNPREDICTABLE(vx${N} > vone_cutoff) {
-          vf${N} = vone;
-        }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $for N in range(BATCH_TILE):
-        if XNN_UNPREDICTABLE(vx${N} < vdenorm_cutoff) {
-          vf${N} = 0.0f;
-        }
-
       $for N in range(BATCH_TILE):
         y[${N}] = vf${N};
       y += ${BATCH_TILE};
@@ -184,23 +175,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = ve / (ve + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y++ = vf;
 
       n -= sizeof(float);
@@ -255,23 +240,17 @@
       // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
       float vf = ve / (ve + vone);
 
+      // For inputs above denormal cutoff, replace output with +0.0f.
+      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+      if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+        vf = 0.0f;
+      }
+
       // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
       if XNN_UNPREDICTABLE(vx > 0.0f) {
         vf = vone - vf;
       }
 
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-        vf = vone;
-      }
-
-      // For inputs below denormal cutoff, replace output with +0.0f.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-        vf = 0.0f;
-      }
-
       *y = vf;
     }
   $else:
@@ -325,23 +304,17 @@
         // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
         float vf = ve / (ve + vone);
 
+        // For inputs above denormal cutoff, replace output with +0.0f.
+        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+        if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+          vf = 0.0f;
+        }
+
         // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
         if XNN_UNPREDICTABLE(vx > 0.0f) {
           vf = vone - vf;
         }
 
-        // For inputs above 1.0 cutoff, replace output with 1.0.
-        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-        if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-          vf = vone;
-        }
-
-        // For inputs below denormal cutoff, replace output with +0.0f.
-        // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-        if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-          vf = 0.0f;
-        }
-
         *y++ = vf;
 
         n -= sizeof(float);

diff --git a/src/f32-sigmoid/sse-p5-div.c.in b/src/f32-sigmoid/sse-p5-div.c.in
index 4ae3897..c509e8d 100644
--- a/src/f32-sigmoid/sse-p5-div.c.in
+++ b/src/f32-sigmoid/sse-p5-div.c.in

@@ -5,7 +5,7 @@
 
 $assert BATCH_TILE % 4 == 0
 $assert BATCH_TILE >= 4
-$ABC = "0123456789ABCDEFGHIJKLMN"
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
 
 $if BLEND:
@@ -29,10 +29,8 @@
   // The smallest x for which sigmoidf(x) is normalized.
   // This number is also the smallest x for which expf(x) is normalized.
   const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
   const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
-  // Last 8 bits are zeroes
+  // Last 7 bits are zeroes
   const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
   const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
   const __m128 vone = _mm_set1_ps(1.0f);
@@ -44,116 +42,11 @@
   const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
   const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
 
-  for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
-    const __m128 vx${ABC[0:4]} = _mm_loadu_ps(x);
-    $for N in range(4, BATCH_TILE, 4):
-      const __m128 vx${ABC[N:N+4]} = _mm_loadu_ps(x + ${N});
-
-    // General structure of the algorithm:
-    //           / exp(x) / (1 + exp(x)) if x <= 0
-    //   f[x] := 
-    //           \ 1 - f[-x] if x >= 0
-    //
-    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
-    // then replace result with 1 - f[z] if x >= 0.
-    $for N in range(0, BATCH_TILE, 4):
-      const __m128 vz${ABC[N:N+4]} = _mm_or_ps(vx${ABC[N:N+4]}, vsign_mask);
-
-    // Compute reduced argument n := round(z / log(2)).
-    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
-    // to an integer, then subtracing the large number back. The trick with adding large number is valid only within
-    // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
-    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
-    // the algorithm.
-    $for N in range(0, BATCH_TILE, 4):
-      __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vz${ABC[N:N+4]}, vlog2e), vmagic_bias);
-
-    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
-    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    $for N in range(0, BATCH_TILE, 4):
-      const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23));
-
-    // Subtract the large number back to get final n := round(z / log(2)).
-    $for N in range(0, BATCH_TILE, 4):
-      vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias);
-
-    // Compute reduced argument t := z - n * log(2).
-    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    $for N in range(0, BATCH_TILE, 4):
-      __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vz${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]});
-
-    // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    $for N in range(0, BATCH_TILE, 4):
-      __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2);
-
-    $for N in range(0, BATCH_TILE, 4):
-      vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1);
-
-    // Reconstruct the exp(z) value:
-    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
-    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
-    //     = s + (t * s) * p
-    $for N in range(0, BATCH_TILE, 4):
-      vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
-
-    $for N in range(0, BATCH_TILE, 4):
-      __m128 ve${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]});
-
-    // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    $for N in range(0, BATCH_TILE, 4):
-      __m128 vd${ABC[N:N+4]} = _mm_add_ps(ve${ABC[N:N+4]}, vone);
-
-    // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    $for N in range(0, BATCH_TILE, 4):
-      __m128 vf${ABC[N:N+4]} = _mm_div_ps(ve${ABC[N:N+4]}, vd${ABC[N:N+4]});
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    $if BLEND:
-      $for N in range(0, BATCH_TILE, 4):
-        vf${ABC[N:N+4]} = _mm_blendv_ps(_mm_sub_ps(vone, vf${ABC[N:N+4]}), vf${ABC[N:N+4]}, vx${ABC[N:N+4]});
-    $else:
-      $for N in range(0, BATCH_TILE, 4):
-        __m128 vm${ABC[N:N+4]} = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx${ABC[N:N+4]})));
-
-      $for N in range(0, BATCH_TILE, 4):
-        vf${ABC[N:N+4]} = _mm_or_ps(_mm_and_ps(vf${ABC[N:N+4]}, vm${ABC[N:N+4]}), _mm_andnot_ps(vm${ABC[N:N+4]}, _mm_sub_ps(vone, vf${ABC[N:N+4]})));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    $if BLEND:
-      $for N in range(0, BATCH_TILE, 4):
-        vf${ABC[N:N+4]} = _mm_blendv_ps(vf${ABC[N:N+4]}, vone, _mm_cmpgt_ps(vx${ABC[N:N+4]}, vone_cutoff));
-    $else:
-      $for N in range(0, BATCH_TILE, 4):
-        vm${ABC[N:N+4]} = _mm_cmpgt_ps(vx${ABC[N:N+4]}, vone_cutoff);
-
-      $for N in range(0, BATCH_TILE, 4):
-        vf${ABC[N:N+4]} = _mm_or_ps(_mm_and_ps(vone, vm${ABC[N:N+4]}), _mm_andnot_ps(vm${ABC[N:N+4]}, vf${ABC[N:N+4]}));
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    $for N in range(0, BATCH_TILE, 4):
-      vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vx${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]});
-
-    _mm_storeu_ps(y, vf${ABC[0:4]});
-    $for N in range(4, BATCH_TILE, 4):
-      _mm_storeu_ps(y + ${N}, vf${ABC[N:N+4]});
-
-    x += ${BATCH_TILE};
-    y += ${BATCH_TILE};
-  }
   $if BATCH_TILE > 4:
-    for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
-      const __m128 vx0123 = _mm_loadu_ps(x);
+    for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+      const __m128 vx${ABC[0:4]} = _mm_loadu_ps(x);
+      $for N in range(4, BATCH_TILE, 4):
+        const __m128 vx${ABC[N:N+4]} = _mm_loadu_ps(x + ${N});
 
       // General structure of the algorithm:
       //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -162,7 +55,8 @@
       //
       // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
       // then replace result with 1 - f[z] if x >= 0.
-      const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+      $for N in range(0, BATCH_TILE, 4):
+        const __m128 vz${ABC[N:N+4]} = _mm_or_ps(vx${ABC[N:N+4]}, vsign_mask);
 
       // Compute reduced argument n := round(z / log(2)).
       // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -170,65 +64,82 @@
       // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
       // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
       // the algorithm.
-      __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+      $for N in range(0, BATCH_TILE, 4):
+        __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vz${ABC[N:N+4]}, vlog2e), vmagic_bias);
 
       // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
       // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-      const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+      $for N in range(0, BATCH_TILE, 4):
+        const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23));
 
       // Subtract the large number back to get final n := round(z / log(2)).
-      vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+      $for N in range(0, BATCH_TILE, 4):
+        vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias);
 
       // Compute reduced argument t := z - n * log(2).
       // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-      __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-      vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+      $for N in range(0, BATCH_TILE, 4):
+        __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vz${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]});
 
       // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-      __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-      vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-      vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-      vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+      $for N in range(0, BATCH_TILE, 4):
+        __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2);
+
+      $for N in range(0, BATCH_TILE, 4):
+        vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1);
 
       // Reconstruct the exp(z) value:
       //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
       //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
       //     = s + (t * s) * p
-      vt0123 = _mm_mul_ps(vt0123, vs0123);
-      __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+      $for N in range(0, BATCH_TILE, 4):
+        vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+      $for N in range(0, BATCH_TILE, 4):
+        __m128 ve${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]});
 
       // Denominator of the sigmoid fraction: 1.0 + exp(z)
-      __m128 vd0123 = _mm_add_ps(ve0123, vone);
+      $for N in range(0, BATCH_TILE, 4):
+        __m128 vd${ABC[N:N+4]} = _mm_add_ps(ve${ABC[N:N+4]}, vone);
 
       // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-      __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-      $if BLEND:
-        vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
-      $else:
-        __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
-        vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
-
-      // For inputs above 1.0 cutoff, replace output with 1.0.
-      // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      $if BLEND:
-        vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
-      $else:
-        vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-        vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+      $for N in range(0, BATCH_TILE, 4):
+        __m128 vf${ABC[N:N+4]} = _mm_div_ps(ve${ABC[N:N+4]}, vd${ABC[N:N+4]});
 
       // For inputs below denormal cutoff, replace output with +0.0f.
       // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-      vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+      $for N in range(0, BATCH_TILE, 4):
+        vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vz${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]});
 
-      _mm_storeu_ps(y, vf0123);
+      // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+      $if BLEND:
+        $for N in range(0, BATCH_TILE, 4):
+          vf${ABC[N:N+4]} = _mm_blendv_ps(_mm_sub_ps(vone, vf${ABC[N:N+4]}), vf${ABC[N:N+4]}, vx${ABC[N:N+4]});
+      $else:
+        $for N in range(0, BATCH_TILE, 4):
+          __m128 vm${ABC[N:N+4]} = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx${ABC[N:N+4]})));
 
-      x += 4;
-      y += 4;
+        $for N in range(0, BATCH_TILE, 4):
+          vf${ABC[N:N+4]} = _mm_or_ps(_mm_and_ps(vf${ABC[N:N+4]}, vm${ABC[N:N+4]}), _mm_andnot_ps(vm${ABC[N:N+4]}, _mm_sub_ps(vone, vf${ABC[N:N+4]})));
+
+      _mm_storeu_ps(y, vf${ABC[0:4]});
+      $for N in range(4, BATCH_TILE, 4):
+        _mm_storeu_ps(y + ${N}, vf${ABC[N:N+4]});
+
+      x += ${BATCH_TILE};
+      y += ${BATCH_TILE};
     }
-  if XNN_UNLIKELY(n != 0) {
-    const __m128 vx0123 = _mm_loadu_ps(x);
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_loadu_ps(x);
 
     // General structure of the algorithm:
     //           / exp(x) / (1 + exp(x)) if x <= 0
@@ -237,7 +148,7 @@
     //
     // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
     // then replace result with 1 - f[z] if x >= 0.
-    const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
 
     // Compute reduced argument n := round(z / log(2)).
     // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which cause rounding of the result
@@ -245,65 +156,124 @@
     // certain bounds (|x| <= 2**22), but thats ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outsize
     // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
     // the algorithm.
-    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
 
     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
     // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
-    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
 
     // Subtract the large number back to get final n := round(z / log(2)).
-    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn = _mm_sub_ps(vn, vmagic_bias);
 
     // Compute reduced argument t := z - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
-    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
-    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
 
     // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
-    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
-    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
 
     // Reconstruct the exp(z) value:
     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
     //     = s + (t * s) * p
-    vt0123 = _mm_mul_ps(vt0123, vs0123);
-    __m128 ve0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
 
     // Denominator of the sigmoid fraction: 1.0 + exp(z)
-    __m128 vd0123 = _mm_add_ps(ve0123, vone);
+    __m128 vd = _mm_add_ps(ve, vone);
 
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
-    __m128 vf0123 = _mm_div_ps(ve0123, vd0123);
-
-    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
-    $if BLEND:
-      vf0123 = _mm_blendv_ps(_mm_sub_ps(vone, vf0123), vf0123, vx0123);
-    $else:
-      __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
-      vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
-
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    $if BLEND:
-      vf0123 = _mm_blendv_ps(vf0123, vone, _mm_cmpgt_ps(vx0123, vone_cutoff));
-    $else:
-      vm0123 = _mm_cmpgt_ps(vx0123, vone_cutoff);
-      vf0123 = _mm_or_ps(_mm_and_ps(vone, vm0123), _mm_andnot_ps(vm0123, vf0123));
+    __m128 vf = _mm_div_ps(ve, vd);
 
     // For inputs below denormal cutoff, replace output with +0.0f.
     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    $if BLEND:
+      vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+    $else:
+      __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+      vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
+
+    _mm_storeu_ps(y, vf);
+
+    x += 4;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const __m128 vx = _mm_loadu_ps(x);
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] := 
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[z] := exp(z) / (1 + exp(z)) where z = -abs(x),
+    // then replace result with 1 - f[z] if x >= 0.
+    const __m128 vz = _mm_or_ps(vx, vsign_mask);
+
+    // Compute reduced argument n := round(z / log(2)).
+    // We do it by adding a large number (magic bias) to the product z * (1/log(2)), which causes rounding of the result
+    // to an integer, then subtracting the large number back. The trick with adding large number is valid only within
+    // certain bounds (|x| <= 2**22), but that's ok, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fixup the result for such inputs at the very end of
+    // the algorithm.
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= z <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(z / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the exp(z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 ve = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(z)
+    __m128 vd = _mm_add_ps(ve, vone);
+
+    // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
+    __m128 vf = _mm_div_ps(ve, vd);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+    $if BLEND:
+      vf = _mm_blendv_ps(_mm_sub_ps(vone, vf), vf, vx);
+    $else:
+      __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
+      vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
 
     if (n & (2 * sizeof(float))) {
-      _mm_storel_pi((__m64*) y, vf0123);
-      vf0123 = _mm_movehl_ps(vf0123, vf0123);
+      _mm_storel_pi((__m64*) y, vf);
+      vf = _mm_movehl_ps(vf, vf);
       y += 2;
     }
     if (n & (1 * sizeof(float))) {
-      _mm_store_ss(y, vf0123);
+      _mm_store_ss(y, vf);
     }
   }
 }

diff --git a/src/init.c b/src/init.c
index c01d684..bee5fd4 100644
--- a/src/init.c
+++ b/src/init.c

@@ -1207,6 +1207,7 @@
     };
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__psimd;
     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__psimd_x8;
+    xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__psimd_p5_div_x16;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__psimd_2x8,
       .row_tile = 2,

diff --git a/src/math/sigmoid-neon-lut2048-p1-nr2recps.c b/src/math/sigmoid-neon-lut2048-p1-nr2recps.c
new file mode 100644
index 0000000..f4ea304
--- /dev/null
+++ b/src/math/sigmoid-neon-lut2048-p1-nr2recps.c

@@ -0,0 +1,637 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+// Table of exp2(k / 2048) values, k = 0..2047
+static const float exp2_k_over_2048_table[2048] = {
+  0x1.000000p+0f, 0x1.001630p+0f, 0x1.002C60p+0f, 0x1.004294p+0f,
+  0x1.0058C8p+0f, 0x1.006F00p+0f, 0x1.008538p+0f, 0x1.009B72p+0f,
+  0x1.00B1B0p+0f, 0x1.00C7EEp+0f, 0x1.00DE2Ep+0f, 0x1.00F472p+0f,
+  0x1.010AB6p+0f, 0x1.0120FCp+0f, 0x1.013744p+0f, 0x1.014D8Ep+0f,
+  0x1.0163DAp+0f, 0x1.017A28p+0f, 0x1.019078p+0f, 0x1.01A6CAp+0f,
+  0x1.01BD1Ep+0f, 0x1.01D374p+0f, 0x1.01E9CCp+0f, 0x1.020026p+0f,
+  0x1.021682p+0f, 0x1.022CDEp+0f, 0x1.02433Ep+0f, 0x1.0259A0p+0f,
+  0x1.027004p+0f, 0x1.028668p+0f, 0x1.029CD0p+0f, 0x1.02B338p+0f,
+  0x1.02C9A4p+0f, 0x1.02E010p+0f, 0x1.02F680p+0f, 0x1.030CF0p+0f,
+  0x1.032364p+0f, 0x1.0339D8p+0f, 0x1.035050p+0f, 0x1.0366C8p+0f,
+  0x1.037D42p+0f, 0x1.0393C0p+0f, 0x1.03AA3Ep+0f, 0x1.03C0BEp+0f,
+  0x1.03D742p+0f, 0x1.03EDC6p+0f, 0x1.04044Cp+0f, 0x1.041AD4p+0f,
+  0x1.04315Ep+0f, 0x1.0447EAp+0f, 0x1.045E78p+0f, 0x1.04750Ap+0f,
+  0x1.048B9Cp+0f, 0x1.04A230p+0f, 0x1.04B8C6p+0f, 0x1.04CF5Ep+0f,
+  0x1.04E5F8p+0f, 0x1.04FC94p+0f, 0x1.051330p+0f, 0x1.0529D0p+0f,
+  0x1.054072p+0f, 0x1.055716p+0f, 0x1.056DBCp+0f, 0x1.058464p+0f,
+  0x1.059B0Ep+0f, 0x1.05B1B8p+0f, 0x1.05C866p+0f, 0x1.05DF16p+0f,
+  0x1.05F5C8p+0f, 0x1.060C7Ap+0f, 0x1.062330p+0f, 0x1.0639E8p+0f,
+  0x1.0650A0p+0f, 0x1.06675Cp+0f, 0x1.067E1Ap+0f, 0x1.0694D8p+0f,
+  0x1.06AB9Ap+0f, 0x1.06C25Ep+0f, 0x1.06D922p+0f, 0x1.06EFEAp+0f,
+  0x1.0706B2p+0f, 0x1.071D7Ep+0f, 0x1.07344Ap+0f, 0x1.074B1Ap+0f,
+  0x1.0761EAp+0f, 0x1.0778BEp+0f, 0x1.078F92p+0f, 0x1.07A66Ap+0f,
+  0x1.07BD42p+0f, 0x1.07D41Ep+0f, 0x1.07EAFAp+0f, 0x1.0801DAp+0f,
+  0x1.0818BAp+0f, 0x1.082F9Ep+0f, 0x1.084682p+0f, 0x1.085D68p+0f,
+  0x1.087452p+0f, 0x1.088B3Cp+0f, 0x1.08A22Ap+0f, 0x1.08B918p+0f,
+  0x1.08D008p+0f, 0x1.08E6FCp+0f, 0x1.08FDF0p+0f, 0x1.0914E6p+0f,
+  0x1.092BE0p+0f, 0x1.0942DAp+0f, 0x1.0959D6p+0f, 0x1.0970D6p+0f,
+  0x1.0987D6p+0f, 0x1.099ED8p+0f, 0x1.09B5DEp+0f, 0x1.09CCE4p+0f,
+  0x1.09E3ECp+0f, 0x1.09FAF8p+0f, 0x1.0A1204p+0f, 0x1.0A2912p+0f,
+  0x1.0A4024p+0f, 0x1.0A5736p+0f, 0x1.0A6E4Ap+0f, 0x1.0A8562p+0f,
+  0x1.0A9C7Ap+0f, 0x1.0AB394p+0f, 0x1.0ACAB0p+0f, 0x1.0AE1D0p+0f,
+  0x1.0AF8F0p+0f, 0x1.0B1012p+0f, 0x1.0B2738p+0f, 0x1.0B3E5Ep+0f,
+  0x1.0B5586p+0f, 0x1.0B6CB2p+0f, 0x1.0B83DEp+0f, 0x1.0B9B0Cp+0f,
+  0x1.0BB23Ep+0f, 0x1.0BC970p+0f, 0x1.0BE0A4p+0f, 0x1.0BF7DCp+0f,
+  0x1.0C0F14p+0f, 0x1.0C2650p+0f, 0x1.0C3D8Cp+0f, 0x1.0C54CAp+0f,
+  0x1.0C6C0Cp+0f, 0x1.0C834Ep+0f, 0x1.0C9A94p+0f, 0x1.0CB1DAp+0f,
+  0x1.0CC922p+0f, 0x1.0CE06Ep+0f, 0x1.0CF7BAp+0f, 0x1.0D0F0Ap+0f,
+  0x1.0D265Ap+0f, 0x1.0D3DAEp+0f, 0x1.0D5502p+0f, 0x1.0D6C5Ap+0f,
+  0x1.0D83B2p+0f, 0x1.0D9B0Ep+0f, 0x1.0DB26Ap+0f, 0x1.0DC9CAp+0f,
+  0x1.0DE12Ap+0f, 0x1.0DF88Ep+0f, 0x1.0E0FF2p+0f, 0x1.0E275Ap+0f,
+  0x1.0E3EC4p+0f, 0x1.0E562Ep+0f, 0x1.0E6D9Cp+0f, 0x1.0E850Ap+0f,
+  0x1.0E9C7Cp+0f, 0x1.0EB3F0p+0f, 0x1.0ECB66p+0f, 0x1.0EE2DCp+0f,
+  0x1.0EFA56p+0f, 0x1.0F11D2p+0f, 0x1.0F2950p+0f, 0x1.0F40CEp+0f,
+  0x1.0F5850p+0f, 0x1.0F6FD4p+0f, 0x1.0F875Ap+0f, 0x1.0F9EE2p+0f,
+  0x1.0FB66Ap+0f, 0x1.0FCDF6p+0f, 0x1.0FE584p+0f, 0x1.0FFD14p+0f,
+  0x1.1014A6p+0f, 0x1.102C3Ap+0f, 0x1.1043D0p+0f, 0x1.105B68p+0f,
+  0x1.107302p+0f, 0x1.108A9Ep+0f, 0x1.10A23Cp+0f, 0x1.10B9DEp+0f,
+  0x1.10D180p+0f, 0x1.10E924p+0f, 0x1.1100CAp+0f, 0x1.111872p+0f,
+  0x1.11301Ep+0f, 0x1.1147CAp+0f, 0x1.115F78p+0f, 0x1.117728p+0f,
+  0x1.118EDCp+0f, 0x1.11A690p+0f, 0x1.11BE46p+0f, 0x1.11D600p+0f,
+  0x1.11EDBAp+0f, 0x1.120578p+0f, 0x1.121D36p+0f, 0x1.1234F8p+0f,
+  0x1.124CBAp+0f, 0x1.126480p+0f, 0x1.127C48p+0f, 0x1.129410p+0f,
+  0x1.12ABDCp+0f, 0x1.12C3AAp+0f, 0x1.12DB78p+0f, 0x1.12F34Ap+0f,
+  0x1.130B1Ep+0f, 0x1.1322F4p+0f, 0x1.133ACCp+0f, 0x1.1352A6p+0f,
+  0x1.136A82p+0f, 0x1.138260p+0f, 0x1.139A40p+0f, 0x1.13B222p+0f,
+  0x1.13CA06p+0f, 0x1.13E1ECp+0f, 0x1.13F9D4p+0f, 0x1.1411BEp+0f,
+  0x1.1429AAp+0f, 0x1.14419Ap+0f, 0x1.14598Ap+0f, 0x1.14717Cp+0f,
+  0x1.148972p+0f, 0x1.14A168p+0f, 0x1.14B962p+0f, 0x1.14D15Cp+0f,
+  0x1.14E95Ap+0f, 0x1.150158p+0f, 0x1.15195Ap+0f, 0x1.15315Cp+0f,
+  0x1.154962p+0f, 0x1.15616Ap+0f, 0x1.157974p+0f, 0x1.15917Ep+0f,
+  0x1.15A98Cp+0f, 0x1.15C19Cp+0f, 0x1.15D9AEp+0f, 0x1.15F1C2p+0f,
+  0x1.1609D8p+0f, 0x1.1621F0p+0f, 0x1.163A0Ap+0f, 0x1.165226p+0f,
+  0x1.166A46p+0f, 0x1.168266p+0f, 0x1.169A88p+0f, 0x1.16B2AEp+0f,
+  0x1.16CAD4p+0f, 0x1.16E2FCp+0f, 0x1.16FB28p+0f, 0x1.171354p+0f,
+  0x1.172B84p+0f, 0x1.1743B6p+0f, 0x1.175BE8p+0f, 0x1.17741Ep+0f,
+  0x1.178C56p+0f, 0x1.17A48Ep+0f, 0x1.17BCCAp+0f, 0x1.17D508p+0f,
+  0x1.17ED48p+0f, 0x1.18058Ap+0f, 0x1.181DCEp+0f, 0x1.183614p+0f,
+  0x1.184E5Ep+0f, 0x1.1866A8p+0f, 0x1.187EF4p+0f, 0x1.189742p+0f,
+  0x1.18AF94p+0f, 0x1.18C7E6p+0f, 0x1.18E03Cp+0f, 0x1.18F892p+0f,
+  0x1.1910ECp+0f, 0x1.192946p+0f, 0x1.1941A4p+0f, 0x1.195A04p+0f,
+  0x1.197266p+0f, 0x1.198ACAp+0f, 0x1.19A330p+0f, 0x1.19BB98p+0f,
+  0x1.19D402p+0f, 0x1.19EC6Ep+0f, 0x1.1A04DCp+0f, 0x1.1A1D4Cp+0f,
+  0x1.1A35BEp+0f, 0x1.1A4E34p+0f, 0x1.1A66AAp+0f, 0x1.1A7F24p+0f,
+  0x1.1A979Ep+0f, 0x1.1AB01Cp+0f, 0x1.1AC89Ap+0f, 0x1.1AE11Cp+0f,
+  0x1.1AF9A0p+0f, 0x1.1B1226p+0f, 0x1.1B2AACp+0f, 0x1.1B4336p+0f,
+  0x1.1B5BC2p+0f, 0x1.1B7452p+0f, 0x1.1B8CE2p+0f, 0x1.1BA574p+0f,
+  0x1.1BBE08p+0f, 0x1.1BD69Ep+0f, 0x1.1BEF38p+0f, 0x1.1C07D2p+0f,
+  0x1.1C2070p+0f, 0x1.1C390Ep+0f, 0x1.1C51B0p+0f, 0x1.1C6A54p+0f,
+  0x1.1C82FAp+0f, 0x1.1C9BA2p+0f, 0x1.1CB44Ap+0f, 0x1.1CCCF6p+0f,
+  0x1.1CE5A6p+0f, 0x1.1CFE56p+0f, 0x1.1D1708p+0f, 0x1.1D2FBCp+0f,
+  0x1.1D4874p+0f, 0x1.1D612Cp+0f, 0x1.1D79E6p+0f, 0x1.1D92A4p+0f,
+  0x1.1DAB64p+0f, 0x1.1DC424p+0f, 0x1.1DDCE8p+0f, 0x1.1DF5AEp+0f,
+  0x1.1E0E76p+0f, 0x1.1E2740p+0f, 0x1.1E400Cp+0f, 0x1.1E58DAp+0f,
+  0x1.1E71AAp+0f, 0x1.1E8A7Ep+0f, 0x1.1EA352p+0f, 0x1.1EBC2Ap+0f,
+  0x1.1ED502p+0f, 0x1.1EEDDEp+0f, 0x1.1F06BAp+0f, 0x1.1F1F9Ap+0f,
+  0x1.1F387Cp+0f, 0x1.1F5160p+0f, 0x1.1F6A46p+0f, 0x1.1F832Ep+0f,
+  0x1.1F9C18p+0f, 0x1.1FB504p+0f, 0x1.1FCDF4p+0f, 0x1.1FE6E4p+0f,
+  0x1.1FFFD8p+0f, 0x1.2018CCp+0f, 0x1.2031C4p+0f, 0x1.204ABCp+0f,
+  0x1.2063B8p+0f, 0x1.207CB6p+0f, 0x1.2095B6p+0f, 0x1.20AEB8p+0f,
+  0x1.20C7BCp+0f, 0x1.20E0C4p+0f, 0x1.20F9CCp+0f, 0x1.2112D6p+0f,
+  0x1.212BE4p+0f, 0x1.2144F2p+0f, 0x1.215E04p+0f, 0x1.217718p+0f,
+  0x1.21902Cp+0f, 0x1.21A944p+0f, 0x1.21C25Ep+0f, 0x1.21DB7Ap+0f,
+  0x1.21F49Ap+0f, 0x1.220DBAp+0f, 0x1.2226DCp+0f, 0x1.224002p+0f,
+  0x1.225928p+0f, 0x1.227252p+0f, 0x1.228B7Cp+0f, 0x1.22A4AAp+0f,
+  0x1.22BDDAp+0f, 0x1.22D70Cp+0f, 0x1.22F040p+0f, 0x1.230976p+0f,
+  0x1.2322B0p+0f, 0x1.233BEAp+0f, 0x1.235526p+0f, 0x1.236E66p+0f,
+  0x1.2387A6p+0f, 0x1.23A0EAp+0f, 0x1.23BA30p+0f, 0x1.23D378p+0f,
+  0x1.23ECC2p+0f, 0x1.24060Ep+0f, 0x1.241F5Cp+0f, 0x1.2438ACp+0f,
+  0x1.245200p+0f, 0x1.246B54p+0f, 0x1.2484ACp+0f, 0x1.249E06p+0f,
+  0x1.24B760p+0f, 0x1.24D0BEp+0f, 0x1.24EA1Ep+0f, 0x1.250380p+0f,
+  0x1.251CE4p+0f, 0x1.25364Cp+0f, 0x1.254FB4p+0f, 0x1.256920p+0f,
+  0x1.25828Cp+0f, 0x1.259BFCp+0f, 0x1.25B56Ep+0f, 0x1.25CEE2p+0f,
+  0x1.25E858p+0f, 0x1.2601D0p+0f, 0x1.261B4Ap+0f, 0x1.2634C6p+0f,
+  0x1.264E46p+0f, 0x1.2667C6p+0f, 0x1.26814Ap+0f, 0x1.269ACEp+0f,
+  0x1.26B456p+0f, 0x1.26CDE0p+0f, 0x1.26E76Cp+0f, 0x1.2700FAp+0f,
+  0x1.271A8Cp+0f, 0x1.27341Ep+0f, 0x1.274DB2p+0f, 0x1.27674Ap+0f,
+  0x1.2780E4p+0f, 0x1.279A7Ep+0f, 0x1.27B41Cp+0f, 0x1.27CDBCp+0f,
+  0x1.27E75Ep+0f, 0x1.280104p+0f, 0x1.281AAAp+0f, 0x1.283452p+0f,
+  0x1.284DFEp+0f, 0x1.2867ACp+0f, 0x1.28815Cp+0f, 0x1.289B0Cp+0f,
+  0x1.28B4C0p+0f, 0x1.28CE78p+0f, 0x1.28E830p+0f, 0x1.2901EAp+0f,
+  0x1.291BA8p+0f, 0x1.293566p+0f, 0x1.294F28p+0f, 0x1.2968ECp+0f,
+  0x1.2982B2p+0f, 0x1.299C7Ap+0f, 0x1.29B644p+0f, 0x1.29D010p+0f,
+  0x1.29E9E0p+0f, 0x1.2A03B0p+0f, 0x1.2A1D84p+0f, 0x1.2A375Ap+0f,
+  0x1.2A5130p+0f, 0x1.2A6B0Ap+0f, 0x1.2A84E8p+0f, 0x1.2A9EC6p+0f,
+  0x1.2AB8A6p+0f, 0x1.2AD28Ap+0f, 0x1.2AEC6Ep+0f, 0x1.2B0656p+0f,
+  0x1.2B2040p+0f, 0x1.2B3A2Cp+0f, 0x1.2B541Ap+0f, 0x1.2B6E0Ap+0f,
+  0x1.2B87FEp+0f, 0x1.2BA1F2p+0f, 0x1.2BBBEAp+0f, 0x1.2BD5E2p+0f,
+  0x1.2BEFDEp+0f, 0x1.2C09DCp+0f, 0x1.2C23DCp+0f, 0x1.2C3DDEp+0f,
+  0x1.2C57E4p+0f, 0x1.2C71EAp+0f, 0x1.2C8BF4p+0f, 0x1.2CA600p+0f,
+  0x1.2CC00Cp+0f, 0x1.2CDA1Cp+0f, 0x1.2CF430p+0f, 0x1.2D0E44p+0f,
+  0x1.2D285Ap+0f, 0x1.2D4274p+0f, 0x1.2D5C8Ep+0f, 0x1.2D76ACp+0f,
+  0x1.2D90CCp+0f, 0x1.2DAAEEp+0f, 0x1.2DC512p+0f, 0x1.2DDF3Ap+0f,
+  0x1.2DF962p+0f, 0x1.2E138Ep+0f, 0x1.2E2DBAp+0f, 0x1.2E47EAp+0f,
+  0x1.2E621Cp+0f, 0x1.2E7C50p+0f, 0x1.2E9686p+0f, 0x1.2EB0C0p+0f,
+  0x1.2ECAFAp+0f, 0x1.2EE538p+0f, 0x1.2EFF78p+0f, 0x1.2F19BAp+0f,
+  0x1.2F33FEp+0f, 0x1.2F4E44p+0f, 0x1.2F688Cp+0f, 0x1.2F82D8p+0f,
+  0x1.2F9D24p+0f, 0x1.2FB774p+0f, 0x1.2FD1C6p+0f, 0x1.2FEC1Ap+0f,
+  0x1.300670p+0f, 0x1.3020CAp+0f, 0x1.303B24p+0f, 0x1.305582p+0f,
+  0x1.306FE0p+0f, 0x1.308A42p+0f, 0x1.30A4A6p+0f, 0x1.30BF0Cp+0f,
+  0x1.30D976p+0f, 0x1.30F3E0p+0f, 0x1.310E4Ep+0f, 0x1.3128BEp+0f,
+  0x1.31432Ep+0f, 0x1.315DA2p+0f, 0x1.31781Ap+0f, 0x1.319292p+0f,
+  0x1.31AD0Cp+0f, 0x1.31C78Ap+0f, 0x1.31E20Ap+0f, 0x1.31FC8Cp+0f,
+  0x1.321710p+0f, 0x1.323196p+0f, 0x1.324C1Ep+0f, 0x1.3266AAp+0f,
+  0x1.328138p+0f, 0x1.329BC6p+0f, 0x1.32B658p+0f, 0x1.32D0EEp+0f,
+  0x1.32EB84p+0f, 0x1.33061Cp+0f, 0x1.3320B8p+0f, 0x1.333B56p+0f,
+  0x1.3355F4p+0f, 0x1.337098p+0f, 0x1.338B3Cp+0f, 0x1.33A5E2p+0f,
+  0x1.33C08Cp+0f, 0x1.33DB36p+0f, 0x1.33F5E4p+0f, 0x1.341094p+0f,
+  0x1.342B46p+0f, 0x1.3445FAp+0f, 0x1.3460B2p+0f, 0x1.347B6Ap+0f,
+  0x1.349626p+0f, 0x1.34B0E4p+0f, 0x1.34CBA4p+0f, 0x1.34E666p+0f,
+  0x1.35012Cp+0f, 0x1.351BF2p+0f, 0x1.3536BCp+0f, 0x1.355188p+0f,
+  0x1.356C56p+0f, 0x1.358726p+0f, 0x1.35A1FAp+0f, 0x1.35BCCEp+0f,
+  0x1.35D7A6p+0f, 0x1.35F280p+0f, 0x1.360D5Cp+0f, 0x1.36283Ap+0f,
+  0x1.36431Ap+0f, 0x1.365DFEp+0f, 0x1.3678E2p+0f, 0x1.3693CAp+0f,
+  0x1.36AEB4p+0f, 0x1.36C9A0p+0f, 0x1.36E490p+0f, 0x1.36FF80p+0f,
+  0x1.371A74p+0f, 0x1.37356Ap+0f, 0x1.375062p+0f, 0x1.376B5Cp+0f,
+  0x1.378658p+0f, 0x1.37A158p+0f, 0x1.37BC58p+0f, 0x1.37D75Cp+0f,
+  0x1.37F262p+0f, 0x1.380D6Ap+0f, 0x1.382876p+0f, 0x1.384382p+0f,
+  0x1.385E92p+0f, 0x1.3879A4p+0f, 0x1.3894B8p+0f, 0x1.38AFCEp+0f,
+  0x1.38CAE6p+0f, 0x1.38E602p+0f, 0x1.390120p+0f, 0x1.391C40p+0f,
+  0x1.393762p+0f, 0x1.395286p+0f, 0x1.396DACp+0f, 0x1.3988D6p+0f,
+  0x1.39A402p+0f, 0x1.39BF30p+0f, 0x1.39DA60p+0f, 0x1.39F592p+0f,
+  0x1.3A10C8p+0f, 0x1.3A2C00p+0f, 0x1.3A4738p+0f, 0x1.3A6274p+0f,
+  0x1.3A7DB4p+0f, 0x1.3A98F4p+0f, 0x1.3AB438p+0f, 0x1.3ACF7Cp+0f,
+  0x1.3AEAC4p+0f, 0x1.3B0610p+0f, 0x1.3B215Cp+0f, 0x1.3B3CAAp+0f,
+  0x1.3B57FCp+0f, 0x1.3B7350p+0f, 0x1.3B8EA6p+0f, 0x1.3BA9FEp+0f,
+  0x1.3BC55Ap+0f, 0x1.3BE0B6p+0f, 0x1.3BFC16p+0f, 0x1.3C1778p+0f,
+  0x1.3C32DCp+0f, 0x1.3C4E42p+0f, 0x1.3C69ACp+0f, 0x1.3C8518p+0f,
+  0x1.3CA086p+0f, 0x1.3CBBF6p+0f, 0x1.3CD768p+0f, 0x1.3CF2DCp+0f,
+  0x1.3D0E54p+0f, 0x1.3D29CEp+0f, 0x1.3D454Ap+0f, 0x1.3D60C8p+0f,
+  0x1.3D7C4Ap+0f, 0x1.3D97CCp+0f, 0x1.3DB352p+0f, 0x1.3DCEDAp+0f,
+  0x1.3DEA64p+0f, 0x1.3E05F2p+0f, 0x1.3E2180p+0f, 0x1.3E3D12p+0f,
+  0x1.3E58A6p+0f, 0x1.3E743Cp+0f, 0x1.3E8FD6p+0f, 0x1.3EAB70p+0f,
+  0x1.3EC70Ep+0f, 0x1.3EE2AEp+0f, 0x1.3EFE50p+0f, 0x1.3F19F4p+0f,
+  0x1.3F359Cp+0f, 0x1.3F5146p+0f, 0x1.3F6CF2p+0f, 0x1.3F88A0p+0f,
+  0x1.3FA450p+0f, 0x1.3FC004p+0f, 0x1.3FDBB8p+0f, 0x1.3FF770p+0f,
+  0x1.40132Cp+0f, 0x1.402EE8p+0f, 0x1.404AA6p+0f, 0x1.406668p+0f,
+  0x1.40822Cp+0f, 0x1.409DF2p+0f, 0x1.40B9BCp+0f, 0x1.40D586p+0f,
+  0x1.40F154p+0f, 0x1.410D24p+0f, 0x1.4128F6p+0f, 0x1.4144CAp+0f,
+  0x1.4160A2p+0f, 0x1.417C7Cp+0f, 0x1.419858p+0f, 0x1.41B436p+0f,
+  0x1.41D016p+0f, 0x1.41EBFAp+0f, 0x1.4207E0p+0f, 0x1.4223C8p+0f,
+  0x1.423FB2p+0f, 0x1.425BA0p+0f, 0x1.42778Ep+0f, 0x1.429380p+0f,
+  0x1.42AF74p+0f, 0x1.42CB6Cp+0f, 0x1.42E764p+0f, 0x1.430360p+0f,
+  0x1.431F5Ep+0f, 0x1.433B5Ep+0f, 0x1.435760p+0f, 0x1.437366p+0f,
+  0x1.438F6Ep+0f, 0x1.43AB78p+0f, 0x1.43C784p+0f, 0x1.43E392p+0f,
+  0x1.43FFA4p+0f, 0x1.441BB8p+0f, 0x1.4437CEp+0f, 0x1.4453E6p+0f,
+  0x1.447002p+0f, 0x1.448C1Ep+0f, 0x1.44A83Ep+0f, 0x1.44C462p+0f,
+  0x1.44E086p+0f, 0x1.44FCAEp+0f, 0x1.4518D6p+0f, 0x1.453504p+0f,
+  0x1.455132p+0f, 0x1.456D62p+0f, 0x1.458996p+0f, 0x1.45A5CCp+0f,
+  0x1.45C204p+0f, 0x1.45DE3Ep+0f, 0x1.45FA7Cp+0f, 0x1.4616BCp+0f,
+  0x1.4632FEp+0f, 0x1.464F42p+0f, 0x1.466B8Ap+0f, 0x1.4687D2p+0f,
+  0x1.46A41Ep+0f, 0x1.46C06Ep+0f, 0x1.46DCBEp+0f, 0x1.46F912p+0f,
+  0x1.471566p+0f, 0x1.4731C0p+0f, 0x1.474E1Ap+0f, 0x1.476A76p+0f,
+  0x1.4786D6p+0f, 0x1.47A338p+0f, 0x1.47BF9Cp+0f, 0x1.47DC04p+0f,
+  0x1.47F86Ep+0f, 0x1.4814DAp+0f, 0x1.483148p+0f, 0x1.484DB8p+0f,
+  0x1.486A2Cp+0f, 0x1.4886A2p+0f, 0x1.48A31Ap+0f, 0x1.48BF94p+0f,
+  0x1.48DC10p+0f, 0x1.48F890p+0f, 0x1.491512p+0f, 0x1.493198p+0f,
+  0x1.494E1Ep+0f, 0x1.496AA8p+0f, 0x1.498734p+0f, 0x1.49A3C2p+0f,
+  0x1.49C052p+0f, 0x1.49DCE6p+0f, 0x1.49F97Cp+0f, 0x1.4A1614p+0f,
+  0x1.4A32B0p+0f, 0x1.4A4F4Cp+0f, 0x1.4A6BECp+0f, 0x1.4A888Ep+0f,
+  0x1.4AA532p+0f, 0x1.4AC1DAp+0f, 0x1.4ADE84p+0f, 0x1.4AFB30p+0f,
+  0x1.4B17DEp+0f, 0x1.4B3490p+0f, 0x1.4B5144p+0f, 0x1.4B6DFAp+0f,
+  0x1.4B8AB2p+0f, 0x1.4BA76Ep+0f, 0x1.4BC42Ap+0f, 0x1.4BE0EAp+0f,
+  0x1.4BFDAEp+0f, 0x1.4C1A72p+0f, 0x1.4C373Ap+0f, 0x1.4C5404p+0f,
+  0x1.4C70D0p+0f, 0x1.4C8DA0p+0f, 0x1.4CAA70p+0f, 0x1.4CC744p+0f,
+  0x1.4CE41Cp+0f, 0x1.4D00F4p+0f, 0x1.4D1DD0p+0f, 0x1.4D3AAEp+0f,
+  0x1.4D578Ep+0f, 0x1.4D7472p+0f, 0x1.4D9158p+0f, 0x1.4DAE40p+0f,
+  0x1.4DCB2Ap+0f, 0x1.4DE816p+0f, 0x1.4E0506p+0f, 0x1.4E21F8p+0f,
+  0x1.4E3EECp+0f, 0x1.4E5BE4p+0f, 0x1.4E78DEp+0f, 0x1.4E95DAp+0f,
+  0x1.4EB2D8p+0f, 0x1.4ECFDAp+0f, 0x1.4EECDCp+0f, 0x1.4F09E2p+0f,
+  0x1.4F26ECp+0f, 0x1.4F43F6p+0f, 0x1.4F6104p+0f, 0x1.4F7E14p+0f,
+  0x1.4F9B28p+0f, 0x1.4FB83Cp+0f, 0x1.4FD554p+0f, 0x1.4FF26Ep+0f,
+  0x1.500F8Cp+0f, 0x1.502CAAp+0f, 0x1.5049CCp+0f, 0x1.5066F2p+0f,
+  0x1.508418p+0f, 0x1.50A142p+0f, 0x1.50BE6Ep+0f, 0x1.50DB9Cp+0f,
+  0x1.50F8CCp+0f, 0x1.511600p+0f, 0x1.513336p+0f, 0x1.515070p+0f,
+  0x1.516DAAp+0f, 0x1.518AE8p+0f, 0x1.51A828p+0f, 0x1.51C56Ap+0f,
+  0x1.51E2B0p+0f, 0x1.51FFF8p+0f, 0x1.521D42p+0f, 0x1.523A90p+0f,
+  0x1.5257DEp+0f, 0x1.527530p+0f, 0x1.529284p+0f, 0x1.52AFDCp+0f,
+  0x1.52CD36p+0f, 0x1.52EA92p+0f, 0x1.5307F0p+0f, 0x1.532552p+0f,
+  0x1.5342B6p+0f, 0x1.53601Cp+0f, 0x1.537D84p+0f, 0x1.539AF0p+0f,
+  0x1.53B85Ep+0f, 0x1.53D5CEp+0f, 0x1.53F342p+0f, 0x1.5410B8p+0f,
+  0x1.542E30p+0f, 0x1.544BAAp+0f, 0x1.546928p+0f, 0x1.5486A8p+0f,
+  0x1.54A42Ap+0f, 0x1.54C1AEp+0f, 0x1.54DF36p+0f, 0x1.54FCC0p+0f,
+  0x1.551A4Cp+0f, 0x1.5537DCp+0f, 0x1.55556Ep+0f, 0x1.557302p+0f,
+  0x1.559098p+0f, 0x1.55AE32p+0f, 0x1.55CBCEp+0f, 0x1.55E96Cp+0f,
+  0x1.56070Ep+0f, 0x1.5624B2p+0f, 0x1.564258p+0f, 0x1.566000p+0f,
+  0x1.567DACp+0f, 0x1.569B5Ap+0f, 0x1.56B90Ap+0f, 0x1.56D6BEp+0f,
+  0x1.56F474p+0f, 0x1.57122Cp+0f, 0x1.572FE6p+0f, 0x1.574DA4p+0f,
+  0x1.576B64p+0f, 0x1.578926p+0f, 0x1.57A6ECp+0f, 0x1.57C4B4p+0f,
+  0x1.57E27Ep+0f, 0x1.58004Ap+0f, 0x1.581E1Ap+0f, 0x1.583BECp+0f,
+  0x1.5859C0p+0f, 0x1.587798p+0f, 0x1.589572p+0f, 0x1.58B34Ep+0f,
+  0x1.58D12Ep+0f, 0x1.58EF0Ep+0f, 0x1.590CF4p+0f, 0x1.592ADAp+0f,
+  0x1.5948C4p+0f, 0x1.5966B0p+0f, 0x1.59849Ep+0f, 0x1.59A28Ep+0f,
+  0x1.59C082p+0f, 0x1.59DE78p+0f, 0x1.59FC72p+0f, 0x1.5A1A6Ep+0f,
+  0x1.5A386Cp+0f, 0x1.5A566Cp+0f, 0x1.5A7470p+0f, 0x1.5A9276p+0f,
+  0x1.5AB07Ep+0f, 0x1.5ACE88p+0f, 0x1.5AEC96p+0f, 0x1.5B0AA6p+0f,
+  0x1.5B28BAp+0f, 0x1.5B46D0p+0f, 0x1.5B64E8p+0f, 0x1.5B8302p+0f,
+  0x1.5BA120p+0f, 0x1.5BBF40p+0f, 0x1.5BDD62p+0f, 0x1.5BFB88p+0f,
+  0x1.5C19B0p+0f, 0x1.5C37DAp+0f, 0x1.5C5606p+0f, 0x1.5C7436p+0f,
+  0x1.5C9268p+0f, 0x1.5CB09Ep+0f, 0x1.5CCED6p+0f, 0x1.5CED10p+0f,
+  0x1.5D0B4Cp+0f, 0x1.5D298Cp+0f, 0x1.5D47CEp+0f, 0x1.5D6612p+0f,
+  0x1.5D845Ap+0f, 0x1.5DA2A2p+0f, 0x1.5DC0F0p+0f, 0x1.5DDF3Ep+0f,
+  0x1.5DFD90p+0f, 0x1.5E1BE4p+0f, 0x1.5E3A3Cp+0f, 0x1.5E5896p+0f,
+  0x1.5E76F2p+0f, 0x1.5E9550p+0f, 0x1.5EB3B2p+0f, 0x1.5ED216p+0f,
+  0x1.5EF07Cp+0f, 0x1.5F0EE6p+0f, 0x1.5F2D52p+0f, 0x1.5F4BC0p+0f,
+  0x1.5F6A32p+0f, 0x1.5F88A6p+0f, 0x1.5FA71Cp+0f, 0x1.5FC596p+0f,
+  0x1.5FE412p+0f, 0x1.600290p+0f, 0x1.602112p+0f, 0x1.603F96p+0f,
+  0x1.605E1Cp+0f, 0x1.607CA4p+0f, 0x1.609B30p+0f, 0x1.60B9BEp+0f,
+  0x1.60D850p+0f, 0x1.60F6E4p+0f, 0x1.61157Ap+0f, 0x1.613412p+0f,
+  0x1.6152AEp+0f, 0x1.61714Cp+0f, 0x1.618FEEp+0f, 0x1.61AE92p+0f,
+  0x1.61CD38p+0f, 0x1.61EBE0p+0f, 0x1.620A8Cp+0f, 0x1.62293Ap+0f,
+  0x1.6247ECp+0f, 0x1.62669Ep+0f, 0x1.628554p+0f, 0x1.62A40Ep+0f,
+  0x1.62C2CAp+0f, 0x1.62E188p+0f, 0x1.630048p+0f, 0x1.631F0Cp+0f,
+  0x1.633DD2p+0f, 0x1.635C9Ap+0f, 0x1.637B66p+0f, 0x1.639A34p+0f,
+  0x1.63B906p+0f, 0x1.63D7D8p+0f, 0x1.63F6AEp+0f, 0x1.641588p+0f,
+  0x1.643464p+0f, 0x1.645342p+0f, 0x1.647222p+0f, 0x1.649106p+0f,
+  0x1.64AFECp+0f, 0x1.64CED6p+0f, 0x1.64EDC0p+0f, 0x1.650CAEp+0f,
+  0x1.652BA0p+0f, 0x1.654A94p+0f, 0x1.65698Ap+0f, 0x1.658882p+0f,
+  0x1.65A77Ep+0f, 0x1.65C67Cp+0f, 0x1.65E57Ep+0f, 0x1.660482p+0f,
+  0x1.662388p+0f, 0x1.664292p+0f, 0x1.66619Ep+0f, 0x1.6680ACp+0f,
+  0x1.669FBCp+0f, 0x1.66BED0p+0f, 0x1.66DDE8p+0f, 0x1.66FD00p+0f,
+  0x1.671C1Cp+0f, 0x1.673B3Cp+0f, 0x1.675A5Cp+0f, 0x1.677980p+0f,
+  0x1.6798A8p+0f, 0x1.67B7D0p+0f, 0x1.67D6FCp+0f, 0x1.67F62Cp+0f,
+  0x1.68155Ep+0f, 0x1.683492p+0f, 0x1.6853C8p+0f, 0x1.687302p+0f,
+  0x1.68923Ep+0f, 0x1.68B17Ep+0f, 0x1.68D0C0p+0f, 0x1.68F004p+0f,
+  0x1.690F4Cp+0f, 0x1.692E96p+0f, 0x1.694DE2p+0f, 0x1.696D30p+0f,
+  0x1.698C84p+0f, 0x1.69ABD8p+0f, 0x1.69CB30p+0f, 0x1.69EA8Ap+0f,
+  0x1.6A09E6p+0f, 0x1.6A2946p+0f, 0x1.6A48A8p+0f, 0x1.6A680Ep+0f,
+  0x1.6A8776p+0f, 0x1.6AA6E0p+0f, 0x1.6AC64Ep+0f, 0x1.6AE5BCp+0f,
+  0x1.6B0530p+0f, 0x1.6B24A6p+0f, 0x1.6B441Ep+0f, 0x1.6B6398p+0f,
+  0x1.6B8316p+0f, 0x1.6BA296p+0f, 0x1.6BC21Ap+0f, 0x1.6BE19Ep+0f,
+  0x1.6C0128p+0f, 0x1.6C20B2p+0f, 0x1.6C4040p+0f, 0x1.6C5FD2p+0f,
+  0x1.6C7F64p+0f, 0x1.6C9EFAp+0f, 0x1.6CBE94p+0f, 0x1.6CDE30p+0f,
+  0x1.6CFDCEp+0f, 0x1.6D1D70p+0f, 0x1.6D3D12p+0f, 0x1.6D5CBAp+0f,
+  0x1.6D7C62p+0f, 0x1.6D9C0Ep+0f, 0x1.6DBBBEp+0f, 0x1.6DDB70p+0f,
+  0x1.6DFB24p+0f, 0x1.6E1ADAp+0f, 0x1.6E3A94p+0f, 0x1.6E5A52p+0f,
+  0x1.6E7A10p+0f, 0x1.6E99D2p+0f, 0x1.6EB998p+0f, 0x1.6ED960p+0f,
+  0x1.6EF92Ap+0f, 0x1.6F18F6p+0f, 0x1.6F38C6p+0f, 0x1.6F589Ap+0f,
+  0x1.6F786Ep+0f, 0x1.6F9846p+0f, 0x1.6FB822p+0f, 0x1.6FD800p+0f,
+  0x1.6FF7E0p+0f, 0x1.7017C2p+0f, 0x1.7037A8p+0f, 0x1.705792p+0f,
+  0x1.70777Cp+0f, 0x1.70976Cp+0f, 0x1.70B75Cp+0f, 0x1.70D750p+0f,
+  0x1.70F746p+0f, 0x1.711740p+0f, 0x1.71373Cp+0f, 0x1.71573Ap+0f,
+  0x1.71773Cp+0f, 0x1.719740p+0f, 0x1.71B748p+0f, 0x1.71D752p+0f,
+  0x1.71F75Ep+0f, 0x1.72176Ep+0f, 0x1.723780p+0f, 0x1.725796p+0f,
+  0x1.7277AEp+0f, 0x1.7297C8p+0f, 0x1.72B7E6p+0f, 0x1.72D806p+0f,
+  0x1.72F828p+0f, 0x1.73184Ep+0f, 0x1.733876p+0f, 0x1.7358A2p+0f,
+  0x1.7378D0p+0f, 0x1.739902p+0f, 0x1.73B934p+0f, 0x1.73D96Cp+0f,
+  0x1.73F9A4p+0f, 0x1.7419E0p+0f, 0x1.743A20p+0f, 0x1.745A62p+0f,
+  0x1.747AA6p+0f, 0x1.749AECp+0f, 0x1.74BB36p+0f, 0x1.74DB84p+0f,
+  0x1.74FBD4p+0f, 0x1.751C26p+0f, 0x1.753C7Cp+0f, 0x1.755CD4p+0f,
+  0x1.757D2Ep+0f, 0x1.759D8Cp+0f, 0x1.75BDECp+0f, 0x1.75DE50p+0f,
+  0x1.75FEB6p+0f, 0x1.761F1Ep+0f, 0x1.763F8Ap+0f, 0x1.765FF8p+0f,
+  0x1.76806Ap+0f, 0x1.76A0DEp+0f, 0x1.76C154p+0f, 0x1.76E1CEp+0f,
+  0x1.77024Cp+0f, 0x1.7722CAp+0f, 0x1.77434Cp+0f, 0x1.7763D2p+0f,
+  0x1.77845Ap+0f, 0x1.77A4E4p+0f, 0x1.77C572p+0f, 0x1.77E602p+0f,
+  0x1.780694p+0f, 0x1.78272Ap+0f, 0x1.7847C4p+0f, 0x1.786860p+0f,
+  0x1.7888FEp+0f, 0x1.78A99Ep+0f, 0x1.78CA42p+0f, 0x1.78EAEAp+0f,
+  0x1.790B94p+0f, 0x1.792C40p+0f, 0x1.794CF0p+0f, 0x1.796DA2p+0f,
+  0x1.798E56p+0f, 0x1.79AF0Ep+0f, 0x1.79CFCAp+0f, 0x1.79F086p+0f,
+  0x1.7A1148p+0f, 0x1.7A320Ap+0f, 0x1.7A52D0p+0f, 0x1.7A739Ap+0f,
+  0x1.7A9466p+0f, 0x1.7AB534p+0f, 0x1.7AD606p+0f, 0x1.7AF6DAp+0f,
+  0x1.7B17B0p+0f, 0x1.7B388Ap+0f, 0x1.7B5968p+0f, 0x1.7B7A48p+0f,
+  0x1.7B9B2Ap+0f, 0x1.7BBC0Ep+0f, 0x1.7BDCF8p+0f, 0x1.7BFDE2p+0f,
+  0x1.7C1ED0p+0f, 0x1.7C3FC0p+0f, 0x1.7C60B4p+0f, 0x1.7C81AAp+0f,
+  0x1.7CA2A4p+0f, 0x1.7CC3A0p+0f, 0x1.7CE4A0p+0f, 0x1.7D05A2p+0f,
+  0x1.7D26A6p+0f, 0x1.7D47AEp+0f, 0x1.7D68B8p+0f, 0x1.7D89C6p+0f,
+  0x1.7DAAD6p+0f, 0x1.7DCBE8p+0f, 0x1.7DECFEp+0f, 0x1.7E0E18p+0f,
+  0x1.7E2F34p+0f, 0x1.7E5052p+0f, 0x1.7E7174p+0f, 0x1.7E9298p+0f,
+  0x1.7EB3BEp+0f, 0x1.7ED4E8p+0f, 0x1.7EF616p+0f, 0x1.7F1746p+0f,
+  0x1.7F3878p+0f, 0x1.7F59AEp+0f, 0x1.7F7AE6p+0f, 0x1.7F9C22p+0f,
+  0x1.7FBD60p+0f, 0x1.7FDEA0p+0f, 0x1.7FFFE4p+0f, 0x1.80212Cp+0f,
+  0x1.804276p+0f, 0x1.8063C2p+0f, 0x1.808512p+0f, 0x1.80A664p+0f,
+  0x1.80C7B8p+0f, 0x1.80E912p+0f, 0x1.810A6Cp+0f, 0x1.812BCAp+0f,
+  0x1.814D2Ap+0f, 0x1.816E8Ep+0f, 0x1.818FF6p+0f, 0x1.81B15Ep+0f,
+  0x1.81D2CCp+0f, 0x1.81F43Ap+0f, 0x1.8215ACp+0f, 0x1.823722p+0f,
+  0x1.82589Ap+0f, 0x1.827A14p+0f, 0x1.829B92p+0f, 0x1.82BD12p+0f,
+  0x1.82DE96p+0f, 0x1.83001Ep+0f, 0x1.8321A6p+0f, 0x1.834332p+0f,
+  0x1.8364C2p+0f, 0x1.838654p+0f, 0x1.83A7EAp+0f, 0x1.83C982p+0f,
+  0x1.83EB1Cp+0f, 0x1.840CBAp+0f, 0x1.842E5Ap+0f, 0x1.844FFEp+0f,
+  0x1.8471A4p+0f, 0x1.84934Ep+0f, 0x1.84B4FAp+0f, 0x1.84D6AAp+0f,
+  0x1.84F85Cp+0f, 0x1.851A10p+0f, 0x1.853BC8p+0f, 0x1.855D84p+0f,
+  0x1.857F42p+0f, 0x1.85A102p+0f, 0x1.85C2C6p+0f, 0x1.85E48Cp+0f,
+  0x1.860656p+0f, 0x1.862822p+0f, 0x1.8649F2p+0f, 0x1.866BC4p+0f,
+  0x1.868D9Ap+0f, 0x1.86AF72p+0f, 0x1.86D14Ep+0f, 0x1.86F32Cp+0f,
+  0x1.87150Cp+0f, 0x1.8736F0p+0f, 0x1.8758D6p+0f, 0x1.877AC0p+0f,
+  0x1.879CAEp+0f, 0x1.87BE9Ep+0f, 0x1.87E090p+0f, 0x1.880286p+0f,
+  0x1.88247Ep+0f, 0x1.88467Ap+0f, 0x1.886878p+0f, 0x1.888A7Ap+0f,
+  0x1.88AC7Ep+0f, 0x1.88CE84p+0f, 0x1.88F090p+0f, 0x1.89129Cp+0f,
+  0x1.8934ACp+0f, 0x1.8956C0p+0f, 0x1.8978D6p+0f, 0x1.899AEEp+0f,
+  0x1.89BD0Ap+0f, 0x1.89DF2Ap+0f, 0x1.8A014Ap+0f, 0x1.8A2370p+0f,
+  0x1.8A4598p+0f, 0x1.8A67C2p+0f, 0x1.8A89F0p+0f, 0x1.8AAC20p+0f,
+  0x1.8ACE54p+0f, 0x1.8AF08Ap+0f, 0x1.8B12C4p+0f, 0x1.8B3500p+0f,
+  0x1.8B5740p+0f, 0x1.8B7982p+0f, 0x1.8B9BC8p+0f, 0x1.8BBE10p+0f,
+  0x1.8BE05Cp+0f, 0x1.8C02AAp+0f, 0x1.8C24FCp+0f, 0x1.8C4750p+0f,
+  0x1.8C69A6p+0f, 0x1.8C8C00p+0f, 0x1.8CAE5Ep+0f, 0x1.8CD0BEp+0f,
+  0x1.8CF322p+0f, 0x1.8D1588p+0f, 0x1.8D37F0p+0f, 0x1.8D5A5Cp+0f,
+  0x1.8D7CCCp+0f, 0x1.8D9F3Ep+0f, 0x1.8DC1B2p+0f, 0x1.8DE42Ap+0f,
+  0x1.8E06A6p+0f, 0x1.8E2924p+0f, 0x1.8E4BA4p+0f, 0x1.8E6E28p+0f,
+  0x1.8E90B0p+0f, 0x1.8EB33Ap+0f, 0x1.8ED5C6p+0f, 0x1.8EF856p+0f,
+  0x1.8F1AEAp+0f, 0x1.8F3D80p+0f, 0x1.8F6018p+0f, 0x1.8F82B4p+0f,
+  0x1.8FA554p+0f, 0x1.8FC7F6p+0f, 0x1.8FEA9Ap+0f, 0x1.900D42p+0f,
+  0x1.902FEEp+0f, 0x1.90529Ap+0f, 0x1.90754Cp+0f, 0x1.909800p+0f,
+  0x1.90BAB6p+0f, 0x1.90DD70p+0f, 0x1.91002Ep+0f, 0x1.9122EEp+0f,
+  0x1.9145B0p+0f, 0x1.916876p+0f, 0x1.918B40p+0f, 0x1.91AE0Cp+0f,
+  0x1.91D0DAp+0f, 0x1.91F3ACp+0f, 0x1.921682p+0f, 0x1.92395Ap+0f,
+  0x1.925C36p+0f, 0x1.927F14p+0f, 0x1.92A1F4p+0f, 0x1.92C4D8p+0f,
+  0x1.92E7C0p+0f, 0x1.930AAAp+0f, 0x1.932D98p+0f, 0x1.935088p+0f,
+  0x1.93737Cp+0f, 0x1.939672p+0f, 0x1.93B96Ap+0f, 0x1.93DC68p+0f,
+  0x1.93FF66p+0f, 0x1.94226Ap+0f, 0x1.94456Ep+0f, 0x1.946878p+0f,
+  0x1.948B82p+0f, 0x1.94AE92p+0f, 0x1.94D1A2p+0f, 0x1.94F4B8p+0f,
+  0x1.9517D0p+0f, 0x1.953AEAp+0f, 0x1.955E08p+0f, 0x1.958128p+0f,
+  0x1.95A44Cp+0f, 0x1.95C774p+0f, 0x1.95EA9Ep+0f, 0x1.960DCAp+0f,
+  0x1.9630FAp+0f, 0x1.96542Ep+0f, 0x1.967764p+0f, 0x1.969A9Ep+0f,
+  0x1.96BDDAp+0f, 0x1.96E118p+0f, 0x1.97045Cp+0f, 0x1.9727A0p+0f,
+  0x1.974AEAp+0f, 0x1.976E34p+0f, 0x1.979184p+0f, 0x1.97B4D6p+0f,
+  0x1.97D82Ap+0f, 0x1.97FB82p+0f, 0x1.981EDCp+0f, 0x1.98423Ap+0f,
+  0x1.98659Cp+0f, 0x1.988900p+0f, 0x1.98AC66p+0f, 0x1.98CFD2p+0f,
+  0x1.98F33Ep+0f, 0x1.9916AEp+0f, 0x1.993A22p+0f, 0x1.995D98p+0f,
+  0x1.998112p+0f, 0x1.99A48Ep+0f, 0x1.99C80Ep+0f, 0x1.99EB92p+0f,
+  0x1.9A0F18p+0f, 0x1.9A32A0p+0f, 0x1.9A562Cp+0f, 0x1.9A79BCp+0f,
+  0x1.9A9D4Ep+0f, 0x1.9AC0E2p+0f, 0x1.9AE47Ap+0f, 0x1.9B0816p+0f,
+  0x1.9B2BB4p+0f, 0x1.9B4F56p+0f, 0x1.9B72FCp+0f, 0x1.9B96A2p+0f,
+  0x1.9BBA4Ep+0f, 0x1.9BDDFCp+0f, 0x1.9C01ACp+0f, 0x1.9C2560p+0f,
+  0x1.9C4918p+0f, 0x1.9C6CD2p+0f, 0x1.9C9090p+0f, 0x1.9CB450p+0f,
+  0x1.9CD814p+0f, 0x1.9CFBDAp+0f, 0x1.9D1FA4p+0f, 0x1.9D4372p+0f,
+  0x1.9D6742p+0f, 0x1.9D8B14p+0f, 0x1.9DAEEAp+0f, 0x1.9DD2C4p+0f,
+  0x1.9DF6A0p+0f, 0x1.9E1A80p+0f, 0x1.9E3E62p+0f, 0x1.9E6248p+0f,
+  0x1.9E8632p+0f, 0x1.9EAA1Ep+0f, 0x1.9ECE0Cp+0f, 0x1.9EF1FEp+0f,
+  0x1.9F15F4p+0f, 0x1.9F39ECp+0f, 0x1.9F5DE8p+0f, 0x1.9F81E8p+0f,
+  0x1.9FA5E8p+0f, 0x1.9FC9EEp+0f, 0x1.9FEDF6p+0f, 0x1.A01200p+0f,
+  0x1.A03610p+0f, 0x1.A05A20p+0f, 0x1.A07E36p+0f, 0x1.A0A24Cp+0f,
+  0x1.A0C668p+0f, 0x1.A0EA86p+0f, 0x1.A10EA6p+0f, 0x1.A132CAp+0f,
+  0x1.A156F2p+0f, 0x1.A17B1Cp+0f, 0x1.A19F4Ap+0f, 0x1.A1C37Ap+0f,
+  0x1.A1E7AEp+0f, 0x1.A20BE6p+0f, 0x1.A23020p+0f, 0x1.A2545Ep+0f,
+  0x1.A2789Ep+0f, 0x1.A29CE2p+0f, 0x1.A2C128p+0f, 0x1.A2E572p+0f,
+  0x1.A309BEp+0f, 0x1.A32E0Ep+0f, 0x1.A35262p+0f, 0x1.A376B8p+0f,
+  0x1.A39B12p+0f, 0x1.A3BF6Ep+0f, 0x1.A3E3CEp+0f, 0x1.A40832p+0f,
+  0x1.A42C98p+0f, 0x1.A45102p+0f, 0x1.A4756Ep+0f, 0x1.A499DEp+0f,
+  0x1.A4BE50p+0f, 0x1.A4E2C6p+0f, 0x1.A50740p+0f, 0x1.A52BBCp+0f,
+  0x1.A5503Cp+0f, 0x1.A574BEp+0f, 0x1.A59944p+0f, 0x1.A5BDCCp+0f,
+  0x1.A5E258p+0f, 0x1.A606E8p+0f, 0x1.A62B7Ap+0f, 0x1.A65010p+0f,
+  0x1.A674A8p+0f, 0x1.A69944p+0f, 0x1.A6BDE4p+0f, 0x1.A6E286p+0f,
+  0x1.A7072Cp+0f, 0x1.A72BD4p+0f, 0x1.A75080p+0f, 0x1.A77530p+0f,
+  0x1.A799E2p+0f, 0x1.A7BE96p+0f, 0x1.A7E350p+0f, 0x1.A8080Ap+0f,
+  0x1.A82CCAp+0f, 0x1.A8518Cp+0f, 0x1.A87652p+0f, 0x1.A89B1Ap+0f,
+  0x1.A8BFE6p+0f, 0x1.A8E4B4p+0f, 0x1.A90986p+0f, 0x1.A92E5Cp+0f,
+  0x1.A95334p+0f, 0x1.A97810p+0f, 0x1.A99CEEp+0f, 0x1.A9C1D0p+0f,
+  0x1.A9E6B6p+0f, 0x1.AA0B9Ep+0f, 0x1.AA308Ap+0f, 0x1.AA5578p+0f,
+  0x1.AA7A6Ap+0f, 0x1.AA9F60p+0f, 0x1.AAC458p+0f, 0x1.AAE954p+0f,
+  0x1.AB0E52p+0f, 0x1.AB3354p+0f, 0x1.AB585Ap+0f, 0x1.AB7D62p+0f,
+  0x1.ABA26Ep+0f, 0x1.ABC77Cp+0f, 0x1.ABEC8Ep+0f, 0x1.AC11A4p+0f,
+  0x1.AC36BCp+0f, 0x1.AC5BD8p+0f, 0x1.AC80F6p+0f, 0x1.ACA618p+0f,
+  0x1.ACCB3Ep+0f, 0x1.ACF066p+0f, 0x1.AD1592p+0f, 0x1.AD3AC2p+0f,
+  0x1.AD5FF4p+0f, 0x1.AD852Ap+0f, 0x1.ADAA62p+0f, 0x1.ADCF9Ep+0f,
+  0x1.ADF4DCp+0f, 0x1.AE1A20p+0f, 0x1.AE3F64p+0f, 0x1.AE64AEp+0f,
+  0x1.AE89FAp+0f, 0x1.AEAF48p+0f, 0x1.AED49Cp+0f, 0x1.AEF9F2p+0f,
+  0x1.AF1F4Ap+0f, 0x1.AF44A6p+0f, 0x1.AF6A06p+0f, 0x1.AF8F68p+0f,
+  0x1.AFB4CEp+0f, 0x1.AFDA38p+0f, 0x1.AFFFA4p+0f, 0x1.B02514p+0f,
+  0x1.B04A86p+0f, 0x1.B06FFCp+0f, 0x1.B09576p+0f, 0x1.B0BAF2p+0f,
+  0x1.B0E072p+0f, 0x1.B105F6p+0f, 0x1.B12B7Cp+0f, 0x1.B15106p+0f,
+  0x1.B17692p+0f, 0x1.B19C22p+0f, 0x1.B1C1B6p+0f, 0x1.B1E74Cp+0f,
+  0x1.B20CE6p+0f, 0x1.B23284p+0f, 0x1.B25824p+0f, 0x1.B27DC8p+0f,
+  0x1.B2A370p+0f, 0x1.B2C91Ap+0f, 0x1.B2EEC6p+0f, 0x1.B31478p+0f,
+  0x1.B33A2Cp+0f, 0x1.B35FE2p+0f, 0x1.B3859Ep+0f, 0x1.B3AB5Cp+0f,
+  0x1.B3D11Cp+0f, 0x1.B3F6E0p+0f, 0x1.B41CA8p+0f, 0x1.B44274p+0f,
+  0x1.B46842p+0f, 0x1.B48E12p+0f, 0x1.B4B3E8p+0f, 0x1.B4D9C0p+0f,
+  0x1.B4FF9Ap+0f, 0x1.B5257Ap+0f, 0x1.B54B5Cp+0f, 0x1.B57140p+0f,
+  0x1.B59728p+0f, 0x1.B5BD14p+0f, 0x1.B5E304p+0f, 0x1.B608F6p+0f,
+  0x1.B62EECp+0f, 0x1.B654E4p+0f, 0x1.B67AE0p+0f, 0x1.B6A0E0p+0f,
+  0x1.B6C6E2p+0f, 0x1.B6ECE8p+0f, 0x1.B712F2p+0f, 0x1.B738FEp+0f,
+  0x1.B75F0Ep+0f, 0x1.B78522p+0f, 0x1.B7AB38p+0f, 0x1.B7D152p+0f,
+  0x1.B7F770p+0f, 0x1.B81D90p+0f, 0x1.B843B4p+0f, 0x1.B869DAp+0f,
+  0x1.B89004p+0f, 0x1.B8B632p+0f, 0x1.B8DC64p+0f, 0x1.B90298p+0f,
+  0x1.B928D0p+0f, 0x1.B94F0Ap+0f, 0x1.B97548p+0f, 0x1.B99B8Ap+0f,
+  0x1.B9C1CEp+0f, 0x1.B9E816p+0f, 0x1.BA0E62p+0f, 0x1.BA34B0p+0f,
+  0x1.BA5B04p+0f, 0x1.BA8158p+0f, 0x1.BAA7B2p+0f, 0x1.BACE0Ep+0f,
+  0x1.BAF46Cp+0f, 0x1.BB1AD0p+0f, 0x1.BB4136p+0f, 0x1.BB679Ep+0f,
+  0x1.BB8E0Cp+0f, 0x1.BBB47Cp+0f, 0x1.BBDAEEp+0f, 0x1.BC0166p+0f,
+  0x1.BC27E0p+0f, 0x1.BC4E5Cp+0f, 0x1.BC74DEp+0f, 0x1.BC9B62p+0f,
+  0x1.BCC1EAp+0f, 0x1.BCE874p+0f, 0x1.BD0F02p+0f, 0x1.BD3594p+0f,
+  0x1.BD5C28p+0f, 0x1.BD82C0p+0f, 0x1.BDA95Cp+0f, 0x1.BDCFFAp+0f,
+  0x1.BDF69Cp+0f, 0x1.BE1D42p+0f, 0x1.BE43EAp+0f, 0x1.BE6A96p+0f,
+  0x1.BE9146p+0f, 0x1.BEB7FAp+0f, 0x1.BEDEB0p+0f, 0x1.BF0568p+0f,
+  0x1.BF2C26p+0f, 0x1.BF52E6p+0f, 0x1.BF79AAp+0f, 0x1.BFA070p+0f,
+  0x1.BFC73Cp+0f, 0x1.BFEE08p+0f, 0x1.C014DAp+0f, 0x1.C03BAEp+0f,
+  0x1.C06286p+0f, 0x1.C08962p+0f, 0x1.C0B040p+0f, 0x1.C0D722p+0f,
+  0x1.C0FE06p+0f, 0x1.C124F0p+0f, 0x1.C14BDCp+0f, 0x1.C172CCp+0f,
+  0x1.C199BEp+0f, 0x1.C1C0B4p+0f, 0x1.C1E7AEp+0f, 0x1.C20EAAp+0f,
+  0x1.C235AAp+0f, 0x1.C25CAEp+0f, 0x1.C283B6p+0f, 0x1.C2AAC0p+0f,
+  0x1.C2D1CEp+0f, 0x1.C2F8DEp+0f, 0x1.C31FF4p+0f, 0x1.C3470Cp+0f,
+  0x1.C36E26p+0f, 0x1.C39546p+0f, 0x1.C3BC68p+0f, 0x1.C3E38Ep+0f,
+  0x1.C40AB6p+0f, 0x1.C431E2p+0f, 0x1.C45912p+0f, 0x1.C48046p+0f,
+  0x1.C4A77Cp+0f, 0x1.C4CEB6p+0f, 0x1.C4F5F2p+0f, 0x1.C51D34p+0f,
+  0x1.C54478p+0f, 0x1.C56BC0p+0f, 0x1.C5930Ap+0f, 0x1.C5BA58p+0f,
+  0x1.C5E1AAp+0f, 0x1.C60900p+0f, 0x1.C63058p+0f, 0x1.C657B4p+0f,
+  0x1.C67F12p+0f, 0x1.C6A676p+0f, 0x1.C6CDDCp+0f, 0x1.C6F546p+0f,
+  0x1.C71CB2p+0f, 0x1.C74422p+0f, 0x1.C76B96p+0f, 0x1.C7930Ep+0f,
+  0x1.C7BA88p+0f, 0x1.C7E206p+0f, 0x1.C80988p+0f, 0x1.C8310Ep+0f,
+  0x1.C85896p+0f, 0x1.C88022p+0f, 0x1.C8A7B0p+0f, 0x1.C8CF44p+0f,
+  0x1.C8F6DAp+0f, 0x1.C91E72p+0f, 0x1.C94610p+0f, 0x1.C96DB0p+0f,
+  0x1.C99554p+0f, 0x1.C9BCFCp+0f, 0x1.C9E4A6p+0f, 0x1.CA0C54p+0f,
+  0x1.CA3406p+0f, 0x1.CA5BBAp+0f, 0x1.CA8372p+0f, 0x1.CAAB2Ep+0f,
+  0x1.CAD2EEp+0f, 0x1.CAFAB0p+0f, 0x1.CB2278p+0f, 0x1.CB4A40p+0f,
+  0x1.CB720Ep+0f, 0x1.CB99DEp+0f, 0x1.CBC1B2p+0f, 0x1.CBE98Ap+0f,
+  0x1.CC1164p+0f, 0x1.CC3944p+0f, 0x1.CC6124p+0f, 0x1.CC890Ap+0f,
+  0x1.CCB0F2p+0f, 0x1.CCD8E0p+0f, 0x1.CD00CEp+0f, 0x1.CD28C2p+0f,
+  0x1.CD50B8p+0f, 0x1.CD78B2p+0f, 0x1.CDA0B0p+0f, 0x1.CDC8B0p+0f,
+  0x1.CDF0B6p+0f, 0x1.CE18BEp+0f, 0x1.CE40C8p+0f, 0x1.CE68D8p+0f,
+  0x1.CE90EAp+0f, 0x1.CEB900p+0f, 0x1.CEE118p+0f, 0x1.CF0936p+0f,
+  0x1.CF3156p+0f, 0x1.CF597Ap+0f, 0x1.CF81A0p+0f, 0x1.CFA9CCp+0f,
+  0x1.CFD1FAp+0f, 0x1.CFFA2Ap+0f, 0x1.D02260p+0f, 0x1.D04A98p+0f,
+  0x1.D072D4p+0f, 0x1.D09B14p+0f, 0x1.D0C358p+0f, 0x1.D0EB9Ep+0f,
+  0x1.D113E8p+0f, 0x1.D13C36p+0f, 0x1.D16486p+0f, 0x1.D18CDAp+0f,
+  0x1.D1B532p+0f, 0x1.D1DD8Ep+0f, 0x1.D205EEp+0f, 0x1.D22E50p+0f,
+  0x1.D256B6p+0f, 0x1.D27F20p+0f, 0x1.D2A78Cp+0f, 0x1.D2CFFCp+0f,
+  0x1.D2F870p+0f, 0x1.D320E8p+0f, 0x1.D34962p+0f, 0x1.D371E2p+0f,
+  0x1.D39A64p+0f, 0x1.D3C2EAp+0f, 0x1.D3EB72p+0f, 0x1.D413FEp+0f,
+  0x1.D43C8Ep+0f, 0x1.D46522p+0f, 0x1.D48DBAp+0f, 0x1.D4B654p+0f,
+  0x1.D4DEF2p+0f, 0x1.D50794p+0f, 0x1.D53038p+0f, 0x1.D558E2p+0f,
+  0x1.D5818Ep+0f, 0x1.D5AA3Ep+0f, 0x1.D5D2F0p+0f, 0x1.D5FBA8p+0f,
+  0x1.D62462p+0f, 0x1.D64D20p+0f, 0x1.D675E2p+0f, 0x1.D69EA6p+0f,
+  0x1.D6C76Ep+0f, 0x1.D6F03Ap+0f, 0x1.D7190Ap+0f, 0x1.D741DEp+0f,
+  0x1.D76AB4p+0f, 0x1.D7938Ep+0f, 0x1.D7BC6Cp+0f, 0x1.D7E54Cp+0f,
+  0x1.D80E32p+0f, 0x1.D8371Ap+0f, 0x1.D86006p+0f, 0x1.D888F4p+0f,
+  0x1.D8B1E8p+0f, 0x1.D8DADEp+0f, 0x1.D903D8p+0f, 0x1.D92CD6p+0f,
+  0x1.D955D8p+0f, 0x1.D97EDCp+0f, 0x1.D9A7E4p+0f, 0x1.D9D0F0p+0f,
+  0x1.D9FA00p+0f, 0x1.DA2312p+0f, 0x1.DA4C28p+0f, 0x1.DA7542p+0f,
+  0x1.DA9E60p+0f, 0x1.DAC782p+0f, 0x1.DAF0A6p+0f, 0x1.DB19CEp+0f,
+  0x1.DB42FAp+0f, 0x1.DB6C2Ap+0f, 0x1.DB955Cp+0f, 0x1.DBBE94p+0f,
+  0x1.DBE7CEp+0f, 0x1.DC110Cp+0f, 0x1.DC3A4Cp+0f, 0x1.DC6392p+0f,
+  0x1.DC8CDAp+0f, 0x1.DCB626p+0f, 0x1.DCDF76p+0f, 0x1.DD08C8p+0f,
+  0x1.DD3220p+0f, 0x1.DD5B7Ap+0f, 0x1.DD84D8p+0f, 0x1.DDAE38p+0f,
+  0x1.DDD79Ep+0f, 0x1.DE0106p+0f, 0x1.DE2A72p+0f, 0x1.DE53E2p+0f,
+  0x1.DE7D56p+0f, 0x1.DEA6CEp+0f, 0x1.DED048p+0f, 0x1.DEF9C6p+0f,
+  0x1.DF2348p+0f, 0x1.DF4CCEp+0f, 0x1.DF7656p+0f, 0x1.DF9FE4p+0f,
+  0x1.DFC974p+0f, 0x1.DFF308p+0f, 0x1.E01C9Ep+0f, 0x1.E0463Ap+0f,
+  0x1.E06FD8p+0f, 0x1.E0997Ap+0f, 0x1.E0C320p+0f, 0x1.E0ECCAp+0f,
+  0x1.E11676p+0f, 0x1.E14028p+0f, 0x1.E169DCp+0f, 0x1.E19394p+0f,
+  0x1.E1BD50p+0f, 0x1.E1E70Ep+0f, 0x1.E210D0p+0f, 0x1.E23A98p+0f,
+  0x1.E26462p+0f, 0x1.E28E2Ep+0f, 0x1.E2B800p+0f, 0x1.E2E1D6p+0f,
+  0x1.E30BAEp+0f, 0x1.E3358Ap+0f, 0x1.E35F6Ap+0f, 0x1.E3894Cp+0f,
+  0x1.E3B334p+0f, 0x1.E3DD1Ep+0f, 0x1.E4070Cp+0f, 0x1.E430FEp+0f,
+  0x1.E45AF4p+0f, 0x1.E484EEp+0f, 0x1.E4AEEAp+0f, 0x1.E4D8EAp+0f,
+  0x1.E502EEp+0f, 0x1.E52CF6p+0f, 0x1.E55702p+0f, 0x1.E58110p+0f,
+  0x1.E5AB24p+0f, 0x1.E5D53Ap+0f, 0x1.E5FF54p+0f, 0x1.E62972p+0f,
+  0x1.E65392p+0f, 0x1.E67DB8p+0f, 0x1.E6A7E0p+0f, 0x1.E6D20Cp+0f,
+  0x1.E6FC3Cp+0f, 0x1.E72670p+0f, 0x1.E750A6p+0f, 0x1.E77AE2p+0f,
+  0x1.E7A520p+0f, 0x1.E7CF62p+0f, 0x1.E7F9A8p+0f, 0x1.E823F2p+0f,
+  0x1.E84E3Ep+0f, 0x1.E87890p+0f, 0x1.E8A2E4p+0f, 0x1.E8CD3Cp+0f,
+  0x1.E8F798p+0f, 0x1.E921F6p+0f, 0x1.E94C5Ap+0f, 0x1.E976C0p+0f,
+  0x1.E9A12Cp+0f, 0x1.E9CB9Ap+0f, 0x1.E9F60Cp+0f, 0x1.EA2080p+0f,
+  0x1.EA4AFAp+0f, 0x1.EA7578p+0f, 0x1.EA9FF8p+0f, 0x1.EACA7Cp+0f,
+  0x1.EAF504p+0f, 0x1.EB1F90p+0f, 0x1.EB4A1Ep+0f, 0x1.EB74B2p+0f,
+  0x1.EB9F48p+0f, 0x1.EBC9E2p+0f, 0x1.EBF480p+0f, 0x1.EC1F22p+0f,
+  0x1.EC49C8p+0f, 0x1.EC7472p+0f, 0x1.EC9F1Ep+0f, 0x1.ECC9CEp+0f,
+  0x1.ECF482p+0f, 0x1.ED1F3Ap+0f, 0x1.ED49F6p+0f, 0x1.ED74B6p+0f,
+  0x1.ED9F78p+0f, 0x1.EDCA40p+0f, 0x1.EDF50Ap+0f, 0x1.EE1FD8p+0f,
+  0x1.EE4AAAp+0f, 0x1.EE7580p+0f, 0x1.EEA05Ap+0f, 0x1.EECB36p+0f,
+  0x1.EEF616p+0f, 0x1.EF20FCp+0f, 0x1.EF4BE4p+0f, 0x1.EF76D0p+0f,
+  0x1.EFA1BEp+0f, 0x1.EFCCB2p+0f, 0x1.EFF7AAp+0f, 0x1.F022A4p+0f,
+  0x1.F04DA2p+0f, 0x1.F078A4p+0f, 0x1.F0A3AAp+0f, 0x1.F0CEB4p+0f,
+  0x1.F0F9C2p+0f, 0x1.F124D2p+0f, 0x1.F14FE8p+0f, 0x1.F17B00p+0f,
+  0x1.F1A61Cp+0f, 0x1.F1D13Cp+0f, 0x1.F1FC60p+0f, 0x1.F22788p+0f,
+  0x1.F252B4p+0f, 0x1.F27DE2p+0f, 0x1.F2A916p+0f, 0x1.F2D44Cp+0f,
+  0x1.F2FF86p+0f, 0x1.F32AC4p+0f, 0x1.F35606p+0f, 0x1.F3814Cp+0f,
+  0x1.F3AC94p+0f, 0x1.F3D7E2p+0f, 0x1.F40332p+0f, 0x1.F42E86p+0f,
+  0x1.F459E0p+0f, 0x1.F4853Cp+0f, 0x1.F4B09Ap+0f, 0x1.F4DBFEp+0f,
+  0x1.F50766p+0f, 0x1.F532D0p+0f, 0x1.F55E40p+0f, 0x1.F589B2p+0f,
+  0x1.F5B528p+0f, 0x1.F5E0A2p+0f, 0x1.F60C20p+0f, 0x1.F637A2p+0f,
+  0x1.F66328p+0f, 0x1.F68EB0p+0f, 0x1.F6BA3Ep+0f, 0x1.F6E5CEp+0f,
+  0x1.F71164p+0f, 0x1.F73CFCp+0f, 0x1.F76898p+0f, 0x1.F79438p+0f,
+  0x1.F7BFDAp+0f, 0x1.F7EB82p+0f, 0x1.F8172Ep+0f, 0x1.F842DCp+0f,
+  0x1.F86E90p+0f, 0x1.F89A46p+0f, 0x1.F8C600p+0f, 0x1.F8F1BEp+0f,
+  0x1.F91D80p+0f, 0x1.F94946p+0f, 0x1.F97510p+0f, 0x1.F9A0DCp+0f,
+  0x1.F9CCAEp+0f, 0x1.F9F882p+0f, 0x1.FA245Cp+0f, 0x1.FA5038p+0f,
+  0x1.FA7C18p+0f, 0x1.FAA7FCp+0f, 0x1.FAD3E4p+0f, 0x1.FAFFD0p+0f,
+  0x1.FB2BC0p+0f, 0x1.FB57B2p+0f, 0x1.FB83AAp+0f, 0x1.FBAFA4p+0f,
+  0x1.FBDBA4p+0f, 0x1.FC07A6p+0f, 0x1.FC33ACp+0f, 0x1.FC5FB6p+0f,
+  0x1.FC8BC4p+0f, 0x1.FCB7D6p+0f, 0x1.FCE3ECp+0f, 0x1.FD1006p+0f,
+  0x1.FD3C22p+0f, 0x1.FD6844p+0f, 0x1.FD9468p+0f, 0x1.FDC092p+0f,
+  0x1.FDECBEp+0f, 0x1.FE18EEp+0f, 0x1.FE4522p+0f, 0x1.FE715Ap+0f,
+  0x1.FE9D96p+0f, 0x1.FEC9D6p+0f, 0x1.FEF61Ap+0f, 0x1.FF2262p+0f,
+  0x1.FF4EACp+0f, 0x1.FF7AFCp+0f, 0x1.FFA74Ep+0f, 0x1.FFD3A6p+0f,
+};
+// Computes sigmoid(x) for the floats in input, 4 per iteration, using a 2048-entry exp2 table and a degree-1 polynomial.
+void xnn_math_f32_sigmoid__neon_lut2048_p1_nr2recps(
+    size_t n,  // size of the input in bytes; must be a multiple of 4 * sizeof(float)
+    const float* input,  // elements to evaluate; read 4 floats at a time
+    float* output)  // receives one result per input element; written 4 floats at a time
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
+  // Negated largest z for which sigmoidf(-z) is normalized (also the largest z for which expf(-z) is normalized).
+  // Only the magnitude matters: the constant is used solely in an absolute-value comparison (vcagtq_f32) below.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
+  // Last 18 bits are zeroes (presumably so that n * vln2_o2048_hi is computed without rounding error - TODO confirm).
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.600000p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(0x1.7217F8p-19f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
+
+  const int32x4_t vindex_mask = vmovq_n_s32(INT32_C(0x7FF));
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(input); input += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z * 2048 / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -2048 * log2(e)
+    // into a single multiply-accumulate (vmlaq_f32). The trick with adding a large number is valid only within certain
+    // bounds (|z * 2048 / log(2)| <= 2**22, i.e. |z| <= 0x1.62E43p+10 = 1419.5654296875), but that is acceptable,
+    // because inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate
+    // sigmoidf(x). We fix up the result for such inputs at the very end of the algorithm.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e_x2048);
+
+    // Create a floating-point number s (scale) such that s := 2**(n / 2048) for such inputs that sigmoidf(-z) is
+    // normalized, i.e. 0 <= z <= 87.33642. As n has 11 fractional bits, we split s == 2**(n / 2048) =
+    // = 2**e * 2**(n / 2048 - e), where e := int(n / 2048). We create s in two steps:
+    // 1. Fetch 2**(n / 2048 - e) = 2**((n % 2048) / 2048) from exp2_k_over_2048_table using the 11 low bits of n, as
+    //    integer. Note that the fetched values are in the [1.0, 2.0) range, i.e. their floating-point exponent is 0.
+    // 2. Adjust fetched value by addition of e to its floating-point exponent. The result is always a normalized
+    //    number, because for 0 <= z <= 87.33642 (inputs for which sigmoidf(-z) is normalized) we have -126 <= e <= 0,
+    //    and thus the adjusted exponent is not lower than -126.
+    //
+    // Extract e from bits 11:19 of n and shift it into bits 23:31 (position of floating-point exponent).
+    const int32x4_t ve = vshlq_n_s32(vbicq_s32(vreinterpretq_s32_f32(vn), vmovq_n_s32(INT32_C(0x7FF))), 12);
+
+    // Use bits 0:11 of n, as integer, as an index for table lookup of l := 2**((n % 2048) / 2048).
+    const uint64x2_t vidx = vreinterpretq_u64_s32(vandq_s32(vreinterpretq_s32_f32(vn), vindex_mask));
+    const uint64_t vidx01 = vgetq_lane_u64(vidx, 0);  // indices of elements 0 (low 32 bits) and 1 (high 32 bits)
+    const uint64_t vidx23 = vgetq_lane_u64(vidx, 1);  // indices of elements 2 (low 32 bits) and 3 (high 32 bits)
+    float32x2_t vl01 = vld1_dup_f32(&exp2_k_over_2048_table[(uint32_t) vidx01]);
+    float32x2_t vl23 = vld1_dup_f32(&exp2_k_over_2048_table[(uint32_t) vidx23]);
+    vl01 = vld1_lane_f32(&exp2_k_over_2048_table[(uint32_t) (vidx01 >> 32)], vl01, 1);
+    vl23 = vld1_lane_f32(&exp2_k_over_2048_table[(uint32_t) (vidx23 >> 32)], vl23, 1);
+    const float32x4_t vl = vcombine_f32(vl01, vl23);
+    // Adjust exponent of the value l fetched from the exp2_k_over_2048_table to get the final s value.
+    const float32x4_t vs = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(vl), ve));
+
+    // Subtract the large number back to get the final n := round(-z * 2048 / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := (z + n * log(2) / 2048). Note that -t = -z - n * log(2) / 2048.
+    // Use Cody-Waite range reduction method (note two constants to represent log(2) / 2048) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_o2048_hi);
+    vt = vmlaq_f32(vt, vn, vln2_o2048_lo);
+
+    // Compute degree-1 polynomial approximation for exp(-t) on [-log(2)/2048, log(2)/2048]:
+    //   P1(t) = 1 + t * c1, where only the p := t * c1 term is materialized here.
+    const float32x4_t vp = vmulq_f32(vt, vc1);
+
+    // Reconstruct the exp(-z) value:
+    //   y = s * (1 + t * c1)
+    //     = s + s * (t * c1)
+    //     = s + s * p
+    const float32x4_t vy = vmlaq_f32(vs, vs, vp);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    const float32x4_t vd = vaddq_f32(vy, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(vy, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (which becomes 1.0f for x >= 0
+    // in the reconstruction below). For NaN inputs the comparison result is false, and vf is left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is decided by the sign bit of x.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(output, vf); output += 4;
+  }
+}

diff --git a/src/math/sigmoid-neon-p5-nr2recps.c b/src/math/sigmoid-neon-p5-nr2recps.c
new file mode 100644
index 0000000..ea409ab
--- /dev/null
+++ b/src/math/sigmoid-neon-p5-nr2recps.c

@@ -0,0 +1,106 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math-stubs.h>
+
+// Computes sigmoid(x) for the floats in input, 4 per iteration, using a degree-5 polynomial approximation of exp.
+void xnn_math_f32_sigmoid__neon_p5_nr2recps(
+    size_t n,  // size of the input in bytes; must be a multiple of 4 * sizeof(float)
+    const float* input,  // elements to evaluate; read 4 floats at a time
+    float* output)  // receives one result per input element; written 4 floats at a time
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);  // 0x1.8p23f + 127 (the float exponent bias)
+  // Negated largest z for which sigmoidf(-z) is normalized (also the largest z for which expf(-z) is normalized).
+  // Only the magnitude matters: the constant is used solely in an absolute-value comparison (vcagtq_f32) below.
+  const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
+  const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes (presumably so that n * vln2_hi is computed without rounding error - TODO confirm).
+  const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E400p-1f);
+  const float32x4_t vln2_lo = vmovq_n_f32(0x1.7F7D1Cp-20f);
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFF6p-1f);
+  const float32x4_t vc2 = vmovq_n_f32(0x1.FFFDC6p-2f);
+  const float32x4_t vc3 = vmovq_n_f32(-0x1.555A80p-3f);
+  const float32x4_t vc4 = vmovq_n_f32(0x1.573A1Ap-5f);
+  const float32x4_t vc5 = vmovq_n_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(input); input += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const float32x4_t vz = vabsq_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with the multiplication by -log2(e) into a
+    // single multiply-accumulate (vmlaq_f32). The trick with adding a large number is valid only within certain bounds
+    // (|x| <= 2**22), but that's OK, because inputs x outside of [-87.336544, 17.328678] (i.e. z outside
+    // [0, 87.336544]) underflow or saturate sigmoidf(x) anyway. We fix up the result for such inputs at the very end.
+    float32x4_t vn = vmlaq_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly. The low bits of vn hold n + 127 (see vmagic_bias).
+    const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = vsubq_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    float32x4_t vt = vmlaq_f32(vz, vn, vln2_hi);
+    vt = vmlaq_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); vp holds the c1 + t * (...) factor.
+    float32x4_t vp = vmlaq_f32(vc4, vc5, vt);
+    vp = vmlaq_f32(vc3, vp, vt);
+    vp = vmlaq_f32(vc2, vp, vt);
+    vp = vmlaq_f32(vc1, vp, vt);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = vmulq_f32(vt, vs);
+    float32x4_t ve = vmlaq_f32(vs, vp, vt);
+
+    // Denominator of the sigmoid fraction: 1.0 + exp(-z)
+    float32x4_t vd = vaddq_f32(ve, vone);
+
+    // Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
+    // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
+    // Thus the reciprocal of the denominator never overflows.
+    float32x4_t vr = vrecpeq_f32(vd);
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+    vr = vmulq_f32(vr, vrecpsq_f32(vr, vd));
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    float32x4_t vf = vmulq_f32(ve, vr);
+
+    // For inputs with |x| above the denormal cutoff, replace sigmoid(-z) with +0.0f (which becomes 1.0f for x >= 0
+    // in the reconstruction below). For NaN inputs the comparison result is false, and vf is left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z); x < 0 is decided by the sign bit of x.
+    const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
+    vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
+
+    vst1q_f32(output, vf); output += 4;
+  }
+}

diff --git a/src/math/sigmoid-neonfma-lut2048-p1-div.c b/src/math/sigmoid-neonfma-lut2048-p1-div.c
index 6d3da1d..7813073 100644
--- a/src/math/sigmoid-neonfma-lut2048-p1-div.c
+++ b/src/math/sigmoid-neonfma-lut2048-p1-div.c

@@ -535,14 +535,12 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
-  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62e43p-12f);
-  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05c61p-40f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
   const float32x4_t vone = vmovq_n_f32(1.0f);
 
   const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
@@ -618,18 +616,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vdivq_f32(vy, vd);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-lut2048-p1-nr1recps1fma.c b/src/math/sigmoid-neonfma-lut2048-p1-nr1recps1fma.c
index c29e3f0..efc07cc 100644
--- a/src/math/sigmoid-neonfma-lut2048-p1-nr1recps1fma.c
+++ b/src/math/sigmoid-neonfma-lut2048-p1-nr1recps1fma.c

@@ -535,14 +535,12 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
-  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62e43p-12f);
-  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05c61p-40f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
   const float32x4_t vone = vmovq_n_f32(1.0f);
 
   const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
@@ -625,18 +623,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(vy, vr);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-lut2048-p1-nr2fma.c b/src/math/sigmoid-neonfma-lut2048-p1-nr2fma.c
index de4e87c..e490562 100644
--- a/src/math/sigmoid-neonfma-lut2048-p1-nr2fma.c
+++ b/src/math/sigmoid-neonfma-lut2048-p1-nr2fma.c

@@ -535,14 +535,12 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
-  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62e43p-12f);
-  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05c61p-40f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
   const float32x4_t vone = vmovq_n_f32(1.0f);
 
   const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
@@ -625,18 +623,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(vy, vr);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-lut2048-p1-nr2recps.c b/src/math/sigmoid-neonfma-lut2048-p1-nr2recps.c
index a8c1972..74ecca5 100644
--- a/src/math/sigmoid-neonfma-lut2048-p1-nr2recps.c
+++ b/src/math/sigmoid-neonfma-lut2048-p1-nr2recps.c

@@ -535,14 +535,12 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.800000p23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e_x2048  = vmovq_n_f32(-0x1.715476p11f);
-  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62e43p-12f);
-  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05c61p-40f);
+  const float32x4_t vln2_o2048_hi = vmovq_n_f32(0x1.62E43p-12f);
+  const float32x4_t vln2_o2048_lo = vmovq_n_f32(-0x1.05C61p-40f);
   const float32x4_t vone = vmovq_n_f32(1.0f);
 
   const float32x4_t vc1 = vmovq_n_f32(-0x1.FFFFFEp-1f);
@@ -625,18 +623,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(vy, vr);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-p5-div.c b/src/math/sigmoid-neonfma-p5-div.c
index 8dd5460..a35cc40 100644
--- a/src/math/sigmoid-neonfma-p5-div.c
+++ b/src/math/sigmoid-neonfma-p5-div.c

@@ -19,11 +19,9 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -87,18 +85,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vdivq_f32(ve, vd);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-p5-nr1recps1fma.c b/src/math/sigmoid-neonfma-p5-nr1recps1fma.c
index 74981da..1c0dac9 100644
--- a/src/math/sigmoid-neonfma-p5-nr1recps1fma.c
+++ b/src/math/sigmoid-neonfma-p5-nr1recps1fma.c

@@ -19,11 +19,9 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -94,18 +92,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(ve, vr);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-p5-nr2fma.c b/src/math/sigmoid-neonfma-p5-nr2fma.c
index 69d235f..bbc2342 100644
--- a/src/math/sigmoid-neonfma-p5-nr2fma.c
+++ b/src/math/sigmoid-neonfma-p5-nr2fma.c

@@ -19,11 +19,9 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -94,18 +92,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(ve, vr);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-neonfma-p5-nr2recps.c b/src/math/sigmoid-neonfma-p5-nr2recps.c
index fdb219f..578da6f 100644
--- a/src/math/sigmoid-neonfma-p5-nr2recps.c
+++ b/src/math/sigmoid-neonfma-p5-nr2recps.c

@@ -19,11 +19,9 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -94,18 +92,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(ve, vr);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }

diff --git a/src/math/sigmoid-psimd-p5-div.c b/src/math/sigmoid-psimd-p5-div.c
new file mode 100644
index 0000000..a580ede
--- /dev/null
+++ b/src/math/sigmoid-psimd-p5-div.c

@@ -0,0 +1,98 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_sigmoid__psimd_p5_div(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
+  const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
+  const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(input);
+    input += 4;
+
+    // General structure of the algorithm:
+    //           / exp(x) / (1 + exp(x)) if x <= 0
+    //   f[x] :=
+    //           \ 1 - f[-x] if x >= 0
+    //
+    // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+    // then replace result with 1 - f[-z] if x >= 0.
+    const psimd_f32 vz = psimd_abs_f32(vx);
+
+    // Compute reduced argument n := round(-z / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of result to an integer, then subtracting the
+    // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+    // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but that's ok, because
+    // inputs x outside of [-87.336544, 17.328678] (i.e. z outside [0, 87.336544]) underflow or saturate sigmoidf(x)
+    // anyway. We fixup the result for such inputs at the very end of the algorithm.
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+    //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
+    vp = psimd_qfma_f32(vc3, vt, vp);
+    vp = psimd_qfma_f32(vc2, vt, vp);
+    vp = psimd_qfma_f32(vc1, vt, vp);
+
+    // Reconstruct the exp(-z) value:
+    //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
+
+    // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
+    psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
+
+    // For inputs with z above the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
+
+    // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
+    vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
+
+    psimd_store_f32(output, vf);
+    output += 4;
+  }
+}

diff --git a/src/math/sigmoid-scalar-lut2048-p1-div.c b/src/math/sigmoid-scalar-lut2048-p1-div.c
index 1425f45..6c4a11f 100644
--- a/src/math/sigmoid-scalar-lut2048-p1-div.c
+++ b/src/math/sigmoid-scalar-lut2048-p1-div.c

@@ -538,11 +538,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x2048 = -0x1.715476p11f;
   // Last 18 bits are zeroes
   const float vln2_o2048_hi = 0x1.600000p-12f;
@@ -612,23 +610,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = vy / (vy + vone);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *output++ = vf;
   }
 }

diff --git a/src/math/sigmoid-scalar-lut64-p2-div.c b/src/math/sigmoid-scalar-lut64-p2-div.c
index 005ecd1..b730d8e 100644
--- a/src/math/sigmoid-scalar-lut64-p2-div.c
+++ b/src/math/sigmoid-scalar-lut64-p2-div.c

@@ -42,11 +42,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.800000p23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e_x64 = -0x1.715476p6f;
   // Last 13 bits are zeroes
   const float vln2_o64_hi =  0x1.630000p-7f;
@@ -118,23 +116,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = vy / (vy + vone);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *output++ = vf;
   }
 }

diff --git a/src/math/sigmoid-scalar-p5-div.c b/src/math/sigmoid-scalar-p5-div.c
index 4d89c7d..49ee25b 100644
--- a/src/math/sigmoid-scalar-p5-div.c
+++ b/src/math/sigmoid-scalar-p5-div.c

@@ -22,11 +22,9 @@
   assert(n % sizeof(float) == 0);
 
   const float vmagic_bias = 0x1.8000FEp23f;
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
-  const float vdenorm_cutoff = -0x1.5D589Ep+6f;
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float vone_cutoff = 0x1.154244p+4f;
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
+  const float vdenorm_cutoff = 0x1.5D589Ep+6f;
   const float vminus_log2e = -0x1.715476p+0f;
   // Last 7 bits are zeroes
   const float vln2_hi = 0x1.62E400p-1f;
@@ -88,23 +86,17 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float vf = ve / (ve + vone);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    if XNN_UNPREDICTABLE(vz > vdenorm_cutoff) {
+      vf = 0.0f;
+    }
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     if XNN_UNPREDICTABLE(vx > 0.0f) {
       vf = vone - vf;
     }
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx > vone_cutoff) {
-      vf = vone;
-    }
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    if XNN_UNPREDICTABLE(vx < vdenorm_cutoff) {
-      vf = 0.0f;
-    }
-
     *output++ = vf;
   }
 }

diff --git a/src/math/sigmoid-sse2-p5-div.c b/src/math/sigmoid-sse2-p5-div.c
index 59f8556..28ef25b 100644
--- a/src/math/sigmoid-sse2-p5-div.c
+++ b/src/math/sigmoid-sse2-p5-div.c

@@ -22,10 +22,8 @@
   // The smallest x for which sigmoidf(x) is normalized.
   // This number is also the smallest x for which expf(x) is normalized.
   const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const __m128 vone_cutoff = _mm_set1_ps(0x1.154244p+4f);
   const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
-  // Last 8 bits are zeroes
+  // Last 7 bits are zeroes
   const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
   const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
   const __m128 vone = _mm_set1_ps(1.0f);
@@ -88,19 +86,14 @@
     // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
     __m128 vf = _mm_div_ps(ve, vd);
 
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
     __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
     vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vm = _mm_cmpgt_ps(vx, vone_cutoff);
-    vf = _mm_or_ps(_mm_and_ps(vone, vm), _mm_andnot_ps(vm, vf));
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
-
     _mm_storeu_ps(output, vf);
 
     input += 4;

diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index d786ede..0b3ad2c 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h

@@ -59,15 +59,18 @@
 DECLARE_F32_EXT_UNARY_MATH_FUNCTION(xnn_math_f32_extexp__avx2_p5)
 DECLARE_F32_EXT_UNARY_MATH_FUNCTION(xnn_math_f32_extexp__avx512f_p5)
 
-DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_div)
-DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_nr1recps1fma)
-DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_nr2fma)
-DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_nr2recps)
-DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_lut2048_p1_div)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neon_lut2048_p1_nr2recps)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_lut2048_p1_nr2recps)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_lut2048_p1_nr1recps1fma)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_lut2048_p1_nr2fma)
-DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_lut2048_p1_nr2recps)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_lut2048_p1_div)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neon_p5_nr2recps)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_nr2recps)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_nr1recps1fma)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_nr2fma)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__neonfma_p5_div)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__sse2_p5_div)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__psimd_p5_div)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__scalar_p5_div)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__scalar_lut64_p2_div)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_sigmoid__scalar_lut2048_p1_div)

diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h
index 2cbfdbc..98ff683 100644
--- a/src/xnnpack/vunary.h
+++ b/src/xnnpack/vunary.h

@@ -23,15 +23,98 @@
       float* y,                                     \
       const void* params);
 
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_div_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_div_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_div_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_div_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_div_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x12)
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2fma_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr1recps1fma_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_p5_nr2recps_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_p5_nr2recps_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_div_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2fma_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr1recps1fma_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neonfma_lut2048_p1_nr2recps_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_lut2048_p1_nr2recps_x24)
 
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__neon_frac_p9_p10_nr1recps_x16)
 
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x4)
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x12)
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse2_p5_div_x24)
 
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x4)
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x12)
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__sse41_p5_div_x24)
+
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__psimd_p5_div_x4)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__psimd_p5_div_x8)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__psimd_p5_div_x12)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__psimd_p5_div_x16)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__psimd_p5_div_x20)
+DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__psimd_p5_div_x24)
 
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x1)
 DECLARE_F32_VUNARY_UKERNEL_FUNCTION(xnn_f32_sigmoid_ukernel__scalar_lut2048_p1_div_x2)
commit	8d3c07e03a55862847e0a6a90f6f9177e87dba4a	[log] [tgz]
author	Marat Dukhan <maratek@google.com>	Thu Jan 02 01:20:59 2020 -0800
committer	XNNPACK Team <xnnpack-github-robot@google.com>	Thu Jan 02 01:21:29 2020 -0800
tree	e328046667bbb59bdd88ce320abcf29f8857cc9a
parent	279908a1af406a1973069979906d9fae569719fa [diff]