SSE2/PSIMD RAddStoreExpMinusMax micro-kernels

PiperOrigin-RevId: 291432270
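
Editorial note (not part of the generated sources): every kernel below
evaluates exp(x) for x <= 0 with the same scheme. A minimal scalar sketch,
assuming the same constants as the vector code (the function name
scalar_expminus is hypothetical), might look like:

    #include <stdint.h>
    #include <string.h>

    // Scalar sketch of the p5 scheme: n := round(x / log(2)) via the
    // magic-bias trick, Cody-Waite reduction t := x - n * log(2), a
    // degree-5 polynomial for exp(t), and reconstruction
    // exp(x) = s * (1 + t * p) with s = 2**n built from the rounded bits.
    static float scalar_expminus(float x) {
      const float vmagic_bias = 0x1.8000FEp23f;
      const float vlog2e = 0x1.715476p+0f;
      const float vminus_ln2_hi = -0x1.62E400p-1f;  // last 7 bits are zeroes
      const float vminus_ln2_lo = -0x1.7F7D1Cp-20f;
      const float vc1 = 0x1.FFFFF6p-1f;
      const float vc2 = 0x1.FFFDC6p-2f;
      const float vc3 = 0x1.555A80p-3f;
      const float vc4 = 0x1.573A1Ap-5f;
      const float vc5 = 0x1.0F9F9Cp-7f;
      const float vdenorm_cutoff = -0x1.5D589Ep6f;

      // Adding the magic bias rounds x / log(2) to an integer n and leaves
      // 127 + n in the low mantissa bits of the float representation.
      float vn = x * vlog2e + vmagic_bias;
      uint32_t n_bits;
      memcpy(&n_bits, &vn, sizeof(n_bits));

      // Shifting (127 + n) into the exponent field yields s = 2**n.
      const uint32_t s_bits = n_bits << 23;
      float vs;
      memcpy(&vs, &s_bits, sizeof(vs));

      vn -= vmagic_bias;  // recover n := round(x / log(2)) as a float

      // Two-constant Cody-Waite reduction keeps t accurate even though
      // n * log(2) is not exactly representable.
      float vt = x + vn * vminus_ln2_hi;
      vt = vt + vn * vminus_ln2_lo;

      // Degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
      float vp = vc4 + vt * vc5;
      vp = vc3 + vt * vp;
      vp = vc2 + vt * vp;
      vp = vc1 + vt * vp;

      // f = s + (t * s) * p == s * (1 + t * p).
      float vf = vs + (vt * vs) * vp;

      // Inputs below the cutoff would produce denormal outputs; flush to +0.0f.
      if (x < vdenorm_cutoff) {
        vf = 0.0f;
      }
      return vf;
    }
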
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
new file mode 100644
index 0000000..8ff55cf
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
@@ -0,0 +1,244 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
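
Editorial note (illustration only): the accN variants spread the running sum
over independent accumulators so consecutive adds form parallel dependency
chains instead of one serial chain, and fold them once after the unrolled
loop. A scalar analogue of the two-accumulator pattern, with a hypothetical
sum_acc2 helper, might be:

    #include <stddef.h>

    // Two independent accumulators mirror vacc0/vacc1 in the *-acc2 kernels:
    // the adds into acc0 and acc1 can overlap in the pipeline.
    static float sum_acc2(const float* f, size_t n) {
      float acc0 = 0.0f;
      float acc1 = 0.0f;
      size_t i = 0;
      for (; i + 2 <= n; i += 2) {
        acc0 += f[i];
        acc1 += f[i + 1];
      }
      if (i < n) {
        acc0 += f[i];  // odd leftover element
      }
      return acc0 + acc1;  // fold, like vacc0 = psimd_add_f32(vacc0, vacc1)
    }
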
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
new file mode 100644
index 0000000..89f7a49
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
@@ -0,0 +1,246 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  psimd_f32 vacc2 = psimd_zero_f32();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc2 = psimd_add_f32(vacc2, vf89AB);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+  vacc0 = psimd_add_f32(vacc0, vacc2);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
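
Editorial note (illustration only): with 1-3 leftover floats, every kernel
still computes a full 4-lane vf, then stores and accumulates just the valid
lanes, shuffling the high lanes down after the 2-element store. A scalar
rendering of that lane bookkeeping, with a hypothetical store_tail helper
and elements counted in floats rather than bytes, might be:

    #include <stddef.h>

    // vf stands in for the 4-lane vector; acc is the running sum of the
    // lanes that were actually stored.
    static void store_tail(float* output, const float vf[4], size_t elements, float* acc) {
      size_t lane = 0;
      if (elements & 2) {
        output[0] = vf[0];  // psimd_store2_f32: low two lanes
        output[1] = vf[1];
        *acc += vf[0] + vf[1];
        output += 2;
        lane = 2;           // psimd_concat_hi_f32 moves lanes 2,3 into 0,1
      }
      if (elements & 1) {
        output[0] = vf[lane];  // psimd_store1_f32: one remaining lane
        *acc += vf[lane];
      }
    }
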
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
new file mode 100644
index 0000000..b517915
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
@@ -0,0 +1,241 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
new file mode 100644
index 0000000..38db010
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
@@ -0,0 +1,260 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+    vacc1 = psimd_add_f32(vacc1, vfCDEF);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
new file mode 100644
index 0000000..12882c1
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
@@ -0,0 +1,264 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  psimd_f32 vacc2 = psimd_zero_f32();
+  psimd_f32 vacc3 = psimd_zero_f32();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc2 = psimd_add_f32(vacc2, vf89AB);
+    vacc3 = psimd_add_f32(vacc3, vfCDEF);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+  vacc2 = psimd_add_f32(vacc2, vacc3);
+  vacc0 = psimd_add_f32(vacc0, vacc2);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
new file mode 100644
index 0000000..8329c65
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
@@ -0,0 +1,257 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+    vacc0 = psimd_add_f32(vacc0, vfCDEF);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
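
The two ln(2) constants above implement Cody-Waite range reduction: vminus_ln2_hi has its low mantissa bits zeroed, so n * ln2_hi is exact over the -126 <= n <= 0 range these kernels produce, and the small ln2_lo term restores the truncated bits. A scalar sketch of the same two-step reduction (illustrative, not kernel code):

#include <math.h>
#include <stdio.h>

int main(void) {
  const float ln2_hi = 0x1.62E400p-1f;   /* ln(2) high part, low bits zero */
  const float ln2_lo = 0x1.7F7D1Cp-20f;  /* ln(2) low part */
  const float x = -10.5f;
  const float n = rintf(x * 0x1.715476p+0f);  /* n = round(x / ln(2)) = -15 */
  /* t = x - n * ln(2), computed in two steps to avoid cancellation error */
  float t = x - n * ln2_hi;
  t = t - n * ln2_lo;
  printf("n = %.0f, t = %.8f (|t| <= ln(2)/2 = %.8f)\n", n, t, logf(2.0f) / 2.0f);
  return 0;
}
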
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
new file mode 100644
index 0000000..939c2a5
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
@@ -0,0 +1,276 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+    const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    psimd_store_f32(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+    vacc1 = psimd_add_f32(vacc1, vfCDEF);
+    vacc0 = psimd_add_f32(vacc0, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
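
The _acc2 suffix means the main loop maintains two partial sums: alternating accumulators shortens the add-to-add dependency chain and, as a side effect, improves the rounding behaviour of the final sum. A scalar analogue of the pattern, including the "Add up all accumulators" step (sketch only):

#include <stdio.h>

/* Scalar analogue of the acc2 accumulation pattern used above. */
static float sum_acc2(const float* v, int n) {
  float acc0 = 0.0f, acc1 = 0.0f;
  int i = 0;
  for (; i + 1 < n; i += 2) {
    acc0 += v[i];      /* even elements */
    acc1 += v[i + 1];  /* odd elements */
  }
  if (i < n) acc0 += v[i];  /* tail element */
  return acc0 + acc1;       /* add up all accumulators */
}

int main(void) {
  const float v[5] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
  printf("%g\n", sum_acc2(v, 5));  /* prints 15 */
  return 0;
}
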
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
new file mode 100644
index 0000000..c037620
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
@@ -0,0 +1,282 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  psimd_f32 vacc2 = psimd_zero_f32();
+  psimd_f32 vacc3 = psimd_zero_f32();
+  psimd_f32 vacc4 = psimd_zero_f32();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+    const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    psimd_store_f32(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc2 = psimd_add_f32(vacc2, vf89AB);
+    vacc3 = psimd_add_f32(vacc3, vfCDEF);
+    vacc4 = psimd_add_f32(vacc4, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+  vacc2 = psimd_add_f32(vacc2, vacc3);
+  vacc0 = psimd_add_f32(vacc0, vacc2);
+  vacc0 = psimd_add_f32(vacc0, vacc4);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
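
The (psimd_u32) vn << 23 cast above is the reason the magic bias ends in ...7F: after the bias add, the low bits of vn hold n + 127, and shifting them into bits 23..30 of an IEEE-754 single yields s = 2**n directly. A scalar sketch using memcpy-based type punning, assuming -126 <= n <= 0 so s stays a normal float:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Shift the biased integer held in the low bits of vn into the exponent
 * field of an IEEE-754 single to build s = 2**n. */
static float scale_from_biased_n(float vn) {
  uint32_t bits;
  memcpy(&bits, &vn, sizeof(bits));  /* well-defined type punning */
  bits <<= 23;                       /* low bits hold n + 127 -> exponent */
  float s;
  memcpy(&s, &bits, sizeof(s));
  return s;
}

int main(void) {
  const float magic_bias = 0x1.8000FEp23f;
  const float vn = -3.0f + magic_bias;          /* vn as the kernels compute it */
  printf("s = %g\n", scale_from_biased_n(vn));  /* prints 0.125 == 2**-3 */
  return 0;
}
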
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
new file mode 100644
index 0000000..5b80fa0
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
@@ -0,0 +1,273 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+    const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    psimd_store_f32(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+    vacc0 = psimd_add_f32(vacc0, vfCDEF);
+    vacc0 = psimd_add_f32(vacc0, vfGHIJ);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
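
Reconstruction is written as f = s + (t * s) * p rather than s * (1 + t * p): multiplying t by s first keeps the last step a single multiply-add and avoids adding the tiny product t * p to 1.0, where its low bits would be lost. A scalar sketch with the same coefficients, including the denormal-cutoff flush (illustrative only, helper names are not XNNPACK API):

#include <math.h>
#include <stdio.h>

/* Scalar sketch of the polynomial evaluation and reconstruction above. */
static float exp_from_s_and_t(float s, float t, float x) {
  float p = 0x1.0F9F9Cp-7f;    /* c5 */
  p = p * t + 0x1.573A1Ap-5f;  /* c4 + t*c5 */
  p = p * t + 0x1.555A80p-3f;  /* c3 + t*(...) */
  p = p * t + 0x1.FFFDC6p-2f;  /* c2 + t*(...) */
  p = p * t + 0x1.FFFFF6p-1f;  /* p = c1 + t*(...) */
  const float ts = t * s;
  float f = s + ts * p;        /* f = s * (1 + t*p), rearranged */
  if (x < -0x1.5D589Ep6f) {
    f = 0.0f;                  /* flush would-be denormal outputs to +0 */
  }
  return f;
}

int main(void) {
  /* x = -1: n = round(x / ln(2)) = -1, s = 2**-1, t = x + ln(2) */
  const float x = -1.0f;
  const float t = x + logf(2.0f);
  printf("%.7f vs libm %.7f\n", exp_from_s_and_t(0.5f, t, x), expf(x));
  return 0;
}
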
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
new file mode 100644
index 0000000..777837d
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 (1x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+
+    // Store 4 (1x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
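
In the tail, elements is a byte count, so elements & (2 * sizeof(float)) and elements & (1 * sizeof(float)) decompose a 1-3 float remainder into an optional pair plus an optional single, matching the psimd_store2/psimd_store1 calls and the lane shuffles that keep only valid lanes in the accumulator. A scalar sketch of the same control flow (hypothetical helper):

#include <stddef.h>
#include <stdio.h>

/* Handle a 1-3 element remainder, given in bytes, as "maybe a pair,
 * then maybe a single" -- the same branch structure as the kernels. */
static void copy_tail(const float* in, float* out, size_t bytes) {
  if (bytes & (2 * sizeof(float))) {  /* bit set when remainder >= 2 */
    out[0] = in[0];
    out[1] = in[1];
    in += 2;
    out += 2;
  }
  if (bytes & (1 * sizeof(float))) {  /* bit set when remainder is odd */
    out[0] = in[0];
  }
}

int main(void) {
  const float src[3] = {1.0f, 2.0f, 3.0f};
  float dst[3] = {0};
  copy_tail(src, dst, 3 * sizeof(float));
  printf("%g %g %g\n", dst[0], dst[1], dst[2]);  /* prints 1 2 3 */
  return 0;
}
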
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
new file mode 100644
index 0000000..2132ed3
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
@@ -0,0 +1,228 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+    // Load 8 (2x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    input += 8;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+
+    // Store 8 (2x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    output += 8;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
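
The acc2 variants exist to break the dependency chain on a single accumulator
register: with two partial sums, consecutive vector adds in the unrolled loop
can issue back to back instead of serializing on one register. A minimal scalar
sketch of the same idea (function and variable names are illustrative):

#include <stddef.h>

float sum_acc2(const float* v, size_t n) {
  float acc0 = 0.0f;  // two independent partial sums, so consecutive
  float acc1 = 0.0f;  // additions do not wait on one another
  size_t i = 0;
  for (; i + 2 <= n; i += 2) {
    acc0 += v[i];
    acc1 += v[i + 1];
  }
  if (i < n) {
    acc0 += v[i];  // odd-length tail
  }
  return acc0 + acc1;  // combine once at the end, as the kernels do
}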
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
new file mode 100644
index 0000000..06a6a75
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
@@ -0,0 +1,225 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+    // Load 8 (2x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    input += 8;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+
+    // Store 8 (2x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    output += 8;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
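
The Cody-Waite reduction above depends on -0x1.62E400p-1f having trailing zero
mantissa bits: for the -126 <= n <= 0 range produced by the magic-bias
rounding, n * ln2_hi is exact in fp32, and only the tiny ln2_lo term
contributes rounding error. A small stand-alone check of that property (an
assumed test program under that reasoning, not part of the diff):

#include <assert.h>
#include <stdio.h>

int main(void) {
  const float ln2_hi = 0x1.62E400p-1f;  // trailing mantissa bits are zero
  for (int n = -126; n <= 0; n++) {
    // The zero tail of ln2_hi leaves room for the up-to-7-bit integer |n|,
    // so the single-precision product cannot lose bits.
    assert((double) ((float) n * ln2_hi) == (double) n * (double) ln2_hi);
  }
  printf("n * ln2_hi is exact for -126 <= n <= 0\n");
  return 0;
}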
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
new file mode 100644
index 0000000..ae36a76
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
@@ -0,0 +1,243 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
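
The three final instructions in the SSE2 kernels implement a horizontal sum of
the four fp32 lanes without SSE3's HADDPS. Factored out as a stand-alone helper
(a sketch; the helper name is not part of this diff):

#include <emmintrin.h>

static inline float hsum_ps_sse2(__m128 v) {
  // [a b c d] + [c d c d] -> [a+c b+d . .]
  v = _mm_add_ps(v, _mm_movehl_ps(v, v));
  // bring lane 1 (b+d) down next to lane 0 and add the two scalars
  v = _mm_add_ss(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtss_f32(v);  // a+c+b+d in lane 0
}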
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
new file mode 100644
index 0000000..9761b8f
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
@@ -0,0 +1,245 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  __m128 vacc2 = _mm_setzero_ps();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc2 = _mm_add_ps(vacc2, vf89AB);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+  vacc0 = _mm_add_ps(vacc0, vacc2);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
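
A note on the masking step shared by all kernels: _mm_cmplt_ps (and the psimd
equivalent) yields all-ones lanes where x < vdenorm_cutoff, and the ANDNOT
clears exactly those lanes to +0.0f, while NaN lanes compare false and pass
through unchanged. In isolation (a sketch; the helper name is an assumption):

#include <emmintrin.h>

static inline __m128 zero_below_cutoff(__m128 f, __m128 x, __m128 cutoff) {
  const __m128 mask = _mm_cmplt_ps(x, cutoff);  // all-ones where x < cutoff
  return _mm_andnot_ps(mask, f);                // ~mask & f: clears masked lanes
}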
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
new file mode 100644
index 0000000..9f8cf69
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc0 = _mm_add_ps(vacc0, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+  }
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
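
The remainder block in each kernel handles 1-3 leftover floats by testing bits
of the byte count: elements is a multiple of sizeof(float), so the 8-byte bit
selects a 2-float store (_mm_storel_pi) and the 4-byte bit a 1-float store
(_mm_store_ss). The same selection logic on plain memory (a sketch; store_tail
is an illustrative name, not part of the diff):

#include <stddef.h>
#include <string.h>

static void store_tail(float* dst, const float* src, size_t elements) {
  if (elements & (2 * sizeof(float))) {  // 8-byte chunk: copy 2 floats
    memcpy(dst, src, 2 * sizeof(float));
    dst += 2;
    src += 2;
  }
  if (elements & (1 * sizeof(float))) {  // 4-byte chunk: copy 1 float
    memcpy(dst, src, sizeof(float));
  }
}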
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
new file mode 100644
index 0000000..2380227
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
@@ -0,0 +1,259 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+    vacc1 = _mm_add_ps(vacc1, vfCDEF);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
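+    // vminus_ln2_hi has its last 7 mantissa bits zeroed, so its product with n (|n| <= 126 fits in 7 bits) is exact
+    // in single precision; vminus_ln2_lo then restores the discarded tail of log(2).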
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
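+    // p(t) = c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))) is evaluated by Horner's scheme; the leading 1 of the
+    // exp(t) series is folded into the reconstruction step below.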
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
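+    // (The expansion of s * (1 + t * p) into s + (t * s) * p is a standard accuracy refinement in vectorized
+    // expf implementations, presumably the reason for this form here.)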
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
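+    // (Below the cutoff, 127 + n falls outside the normal exponent range, so the shift-based scale s above is no
+    // longer valid and these lanes must be zeroed explicitly.)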
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
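+      // Shift the two upper lanes down so that a possible final element ends up in lane 0 for the step below.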
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
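+  // (movehl folds lanes 2 and 3 onto lanes 0 and 1; the shuffle then folds lane 1 onto lane 0.)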
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
new file mode 100644
index 0000000..1e60b65
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
@@ -0,0 +1,263 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  __m128 vacc2 = _mm_setzero_ps();
+  __m128 vacc3 = _mm_setzero_ps();
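+  // Four partial sums let consecutive additions in the main loop proceed independently; they are reduced to a
+  // single accumulator after the loop.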
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents, rotating through all four accumulators to split the dependency chain.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc2 = _mm_add_ps(vacc2, vf89AB);
+    vacc3 = _mm_add_ps(vacc3, vfCDEF);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+  vacc2 = _mm_add_ps(vacc2, vacc3);
+  vacc0 = _mm_add_ps(vacc0, vacc2);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
new file mode 100644
index 0000000..09f542f
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc0 = _mm_add_ps(vacc0, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+    vacc0 = _mm_add_ps(vacc0, vfCDEF);
+  }
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
new file mode 100644
index 0000000..f92b791
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
@@ -0,0 +1,275 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    const __m128 viGHIJ = _mm_loadu_ps(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+    const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    _mm_storeu_ps(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents, alternating between the two accumulators to split the dependency chain.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+    vacc1 = _mm_add_ps(vacc1, vfCDEF);
+    vacc0 = _mm_add_ps(vacc0, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
new file mode 100644
index 0000000..d58661b
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
@@ -0,0 +1,281 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  __m128 vacc2 = _mm_setzero_ps();
+  __m128 vacc3 = _mm_setzero_ps();
+  __m128 vacc4 = _mm_setzero_ps();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    const __m128 viGHIJ = _mm_loadu_ps(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+    const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    _mm_storeu_ps(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc4 = _mm_add_ps(vacc4, vf4567);
+    vacc3 = _mm_add_ps(vacc3, vf89AB);
+    vacc2 = _mm_add_ps(vacc2, vfCDEF);
+    vacc1 = _mm_add_ps(vacc1, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+  vacc2 = _mm_add_ps(vacc2, vacc3);
+  vacc0 = _mm_add_ps(vacc0, vacc2);
+  vacc0 = _mm_add_ps(vacc0, vacc4);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below denormal cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
new file mode 100644
index 0000000..3ab5db3
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
@@ -0,0 +1,272 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    const __m128 viGHIJ = _mm_loadu_ps(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+    const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
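+    // The s + (t*s)*p form lets the exactly-representable s enter through a
+    // final add instead of a multiply, which tends to preserve accuracy.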
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
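+    // _mm_cmplt_ps sets a lane to all ones where x < cutoff; andnot then
+    // clears exactly those lanes, flushing would-be denormal results to +0.0f.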
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    _mm_storeu_ps(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc0 = _mm_add_ps(vacc0, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+    vacc0 = _mm_add_ps(vacc0, vfCDEF);
+    vacc0 = _mm_add_ps(vacc0, vfGHIJ);
+  }
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
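+      // movelh keeps lanes 0-1 of vf and zeroes lanes 2-3, so only the two
+      // stored values enter the sum; movehl below shifts lanes 2-3 down in
+      // case a single element still remains.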
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
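+  // movehl folds lanes {2,3} onto {0,1}; the shuffle then folds lane 1 onto
+  // lane 0, leaving the total in the low lane for _mm_store_ss.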
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
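+
+// A minimal scalar sketch of the same per-element math, kept for reference
+// only; it is not used by the kernel, and the helper name below is
+// illustrative rather than part of the XNNPACK API. It assumes the
+// fp32_to_bits()/fp32_from_bits() helpers from <xnnpack/math.h>.
+#include <xnnpack/math.h>
+
+static inline float scalar_exp_minus_max(float i, float i_max) {
+  const float x = i - i_max;                               // x <= 0
+  float n = x * 0x1.715476p+0f + 0x1.8000FEp23f;           // n := round(x / ln2), biased
+  const float s = fp32_from_bits(fp32_to_bits(n) << 23);   // s = 2**n
+  n -= 0x1.8000FEp23f;
+  float t = n * -0x1.62E400p-1f + x;                       // Cody-Waite, high part
+  t = n * -0x1.7F7D1Cp-20f + t;                            // Cody-Waite, low part
+  float p = 0x1.0F9F9Cp-7f * t + 0x1.573A1Ap-5f;           // degree-5 polynomial in t
+  p = p * t + 0x1.555A80p-3f;
+  p = p * t + 0x1.FFFDC6p-2f;
+  p = p * t + 0x1.FFFFF6p-1f;
+  const float f = t * s * p + s;                           // f = s + (t*s)*p
+  return x < -0x1.5D589Ep6f ? 0.0f : f;                    // flush denormal results to +0
+}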
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
new file mode 100644
index 0000000..157b576
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
@@ -0,0 +1,208 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 (1x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+
+    // Store 4 (1x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+  }
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
new file mode 100644
index 0000000..d3ef0b3
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
@@ -0,0 +1,227 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
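+  // Two partial sums split the accumulation dependency chain so consecutive
+  // _mm_add_ps operations can overlap; they are merged after the main loop.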
+  for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+    // Load 8 (2x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    input += 8;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+
+    // Store 8 (2x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    output += 8;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
new file mode 100644
index 0000000..0460e42
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+    // Load 8 (2x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    input += 8;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+
+    // Store 8 (2x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    output += 8;
+
+    // Accumulate computed exponents.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc0 = _mm_add_ps(vacc0, vf4567);
+  }
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/psimd-p5.c.in b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
new file mode 100644
index 0000000..7cba991
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
@@ -0,0 +1,236 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert ELEMENTS_TILE % 4 == 0
+$assert ELEMENTS_TILE >= 4
+$SIMD_TILE = ELEMENTS_TILE // 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  $for K in range(ACCUMULATORS):
+    psimd_f32 vacc${K} = psimd_zero_f32();
+  for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
+    // Load ${ELEMENTS_TILE} (${SIMD_TILE}x4) inputs at a time.
+    const psimd_f32 vi${ABC[0:4]} = psimd_load_f32(input);
+    $for N in range(4, ELEMENTS_TILE, 4):
+      const psimd_f32 vi${ABC[N:N+4]} = psimd_load_f32(input + ${N});
+    input += ${ELEMENTS_TILE};
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      const psimd_f32 vx${ABC[N:N+4]} = psimd_sub_f32(vi${ABC[N:N+4]}, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    $for N in range(0, ELEMENTS_TILE, 4):
+      psimd_f32 vn${ABC[N:N+4]} = psimd_qfma_f32(vmagic_bias, vx${ABC[N:N+4]}, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      const psimd_f32 vs${ABC[N:N+4]} = (psimd_f32) ((psimd_u32) vn${ABC[N:N+4]} << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vn${ABC[N:N+4]} = psimd_sub_f32(vn${ABC[N:N+4]}, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      psimd_f32 vt${ABC[N:N+4]} = psimd_qfma_f32(vx${ABC[N:N+4]}, vn${ABC[N:N+4]}, vminus_ln2_hi);
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vt${ABC[N:N+4]} = psimd_qfma_f32(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    $for N in range(0, ELEMENTS_TILE, 4):
+      psimd_f32 vp${ABC[N:N+4]} = psimd_qfma_f32(vc4, vc5, vt${ABC[N:N+4]});
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vp${ABC[N:N+4]} = psimd_qfma_f32(vc3, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vp${ABC[N:N+4]} = psimd_qfma_f32(vc2, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vp${ABC[N:N+4]} = psimd_qfma_f32(vc1, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vt${ABC[N:N+4]} = psimd_mul_f32(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      psimd_f32 vf${ABC[N:N+4]} = psimd_qfma_f32(vs${ABC[N:N+4]}, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vf${ABC[N:N+4]} = psimd_andnotmask_f32(vx${ABC[N:N+4]} < vdenorm_cutoff, vf${ABC[N:N+4]});
+
+    // Store ${ELEMENTS_TILE} (${SIMD_TILE}x4) outputs at a time.
+    psimd_store_f32(output, vf${ABC[0:4]});
+    $for N in range(4, ELEMENTS_TILE, 4):
+      psimd_store_f32(output + ${N}, vf${ABC[N:N+4]});
+    output += ${ELEMENTS_TILE};
+
+    // Accumulate computed exponents.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vacc${(N // 4) % ACCUMULATORS} = psimd_add_f32(vacc${(N // 4) % ACCUMULATORS}, vf${ABC[N:N+4]});
+  }
+  $if ACCUMULATORS > 1:
+    // Add up all accumulators to vacc0
+    $ACC_SLICE = 1
+    $while ACC_SLICE < ACCUMULATORS:
+      $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+        $if A + ACC_SLICE < ACCUMULATORS:
+          vacc${A} = psimd_add_f32(vacc${A}, vacc${A + ACC_SLICE});
+      $ACC_SLICE *= 2
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
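+      // psimd has no scalar add, so isolate lane 0 first: concat_lo yields
+      // {vf0, vf1, 0, 0} and concat_even then yields {vf0, 0, 0, 0}.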
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/sse2-p5.c.in b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
new file mode 100644
index 0000000..51a33c8
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
@@ -0,0 +1,235 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert ELEMENTS_TILE % 4 == 0
+$assert ELEMENTS_TILE >= 4
+$SIMD_TILE = ELEMENTS_TILE // 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  $for K in range(ACCUMULATORS):
+    __m128 vacc${K} = _mm_setzero_ps();
+  for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
+    // Load ${ELEMENTS_TILE} (${SIMD_TILE}x4) inputs at a time.
+    const __m128 vi${ABC[0:4]} = _mm_loadu_ps(input);
+    $for N in range(4, ELEMENTS_TILE, 4):
+      const __m128 vi${ABC[N:N+4]} = _mm_loadu_ps(input + ${N});
+    input += ${ELEMENTS_TILE};
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      const __m128 vx${ABC[N:N+4]} = _mm_sub_ps(vi${ABC[N:N+4]}, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    $for N in range(0, ELEMENTS_TILE, 4):
+      __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vx${ABC[N:N+4]}, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vx${ABC[N:N+4]});
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]});
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    $for N in range(0, ELEMENTS_TILE, 4):
+      __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4);
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3);
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2);
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+    $for N in range(0, ELEMENTS_TILE, 4):
+      __m128 vf${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]});
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vx${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]});
+
+    // Store ${ELEMENTS_TILE} (${SIMD_TILE}x4) outputs at a time.
+    _mm_storeu_ps(output, vf${ABC[0:4]});
+    $for N in range(4, ELEMENTS_TILE, 4):
+      _mm_storeu_ps(output + ${N}, vf${ABC[N:N+4]});
+    output += ${ELEMENTS_TILE};
+
+    // Accumulate computed exponents.
+    $for N in range(0, ELEMENTS_TILE, 4):
+      vacc${(N // 4) % ACCUMULATORS} = _mm_add_ps(vacc${(N // 4) % ACCUMULATORS}, vf${ABC[N:N+4]});
+  }
+  $if ACCUMULATORS > 1:
+    // Add up all accumulators to vacc0
+    $ACC_SLICE = 1
+    $while ACC_SLICE < ACCUMULATORS:
+      $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+        $if A + ACC_SLICE < ACCUMULATORS:
+          vacc${A} = _mm_add_ps(vacc${A}, vacc${A + ACC_SLICE});
+      $ACC_SLICE *= 2
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
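For reference, the vector kernel above is a 4-lane unrolling of the scalar routine below. This is an illustrative sketch, not part of the commit: it uses the same constants and the same bit tricks -- magic-bias rounding, scale reconstruction via a left shift by 23, two-constant Cody-Waite range reduction, and the Horner-form degree-5 polynomial -- so each line maps onto one intrinsic in the SSE2 code.

    #include <stdint.h>
    #include <string.h>

    // Scalar sketch of the p5 exp(i - i_max) computation (illustration only).
    static float scalar_expminusmax(float i, float i_max) {
      const float magic_bias    =  0x1.8000FEp23f;
      const float denorm_cutoff = -0x1.5D589Ep6f;
      const float log2e         =  0x1.715476p+0f;
      const float minus_ln2_hi  = -0x1.62E400p-1f;  // last 7 bits are zeroes
      const float minus_ln2_lo  = -0x1.7F7D1Cp-20f;
      const float c1 = 0x1.FFFFF6p-1f, c2 = 0x1.FFFDC6p-2f, c3 = 0x1.555A80p-3f;
      const float c4 = 0x1.573A1Ap-5f, c5 = 0x1.0F9F9Cp-7f;

      const float x = i - i_max;  // x <= 0
      // Magic-bias trick: n := round(x / log(2)) lands in the low bits of the float.
      float n = x * log2e + magic_bias;
      uint32_t n_bits;
      memcpy(&n_bits, &n, sizeof(n_bits));
      // Scale s := 2**n, built by shifting the rounded quotient into the exponent field.
      const uint32_t s_bits = n_bits << 23;
      float s;
      memcpy(&s, &s_bits, sizeof(s));
      n -= magic_bias;
      // Cody-Waite: t := x - n * log(2), with log(2) split into hi + lo parts.
      float t = n * minus_ln2_hi + x;
      t = n * minus_ln2_lo + t;
      // Degree-5 polynomial in Horner form, then f = s + (t * s) * p.
      float p = c5 * t + c4;
      p = p * t + c3;
      p = p * t + c2;
      p = p * t + c1;
      const float f = (t * s) * p + s;
      // Flush results below the denormal cutoff to +0.0f (NaN inputs pass through).
      return x < denorm_cutoff ? 0.0f : f;
    }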
diff --git a/src/f32-rmax/psimd.c b/src/f32-rmax/psimd.c
new file mode 100644
index 0000000..74afbf2
--- /dev/null
+++ b/src/f32-rmax/psimd.c
@@ -0,0 +1,53 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/rmax.h>
+
+
+void xnn_f32_rmax_ukernel__psimd(
+    size_t n,
+    const float* x,
+    float* y)
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  // Seed all 4 accumulators with the first element (n != 0 guarantees a valid load).
+  psimd_f32 vmax0 = psimd_load_splat_f32(x);
+  psimd_f32 vmax1 = vmax0;
+  psimd_f32 vmax2 = vmax0;
+  psimd_f32 vmax3 = vmax0;
+  // Main loop: process 16 elements (4x4) per iteration with 4 independent accumulators.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const psimd_f32 vx0 = psimd_load_f32(x);
+    const psimd_f32 vx1 = psimd_load_f32(x + 4);
+    const psimd_f32 vx2 = psimd_load_f32(x + 8);
+    const psimd_f32 vx3 = psimd_load_f32(x + 12);
+    x += 16;
+
+    vmax0 = psimd_max_f32(vmax0, vx0);
+    vmax1 = psimd_max_f32(vmax1, vx1);
+    vmax2 = psimd_max_f32(vmax2, vx2);
+    vmax3 = psimd_max_f32(vmax3, vx3);
+  }
+  // Combine the 4 accumulators into one.
+  psimd_f32 vmax0123 = psimd_max_f32(psimd_max_f32(vmax0, vmax1), psimd_max_f32(vmax2, vmax3));
+  // Handle remaining full groups of 4 elements.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(x);
+    vmax0123 = psimd_max_f32(vmax0123, vx);
+    x += 4;
+  }
+  // Reduce the 4 lanes to a single scalar maximum.
+  float vmax = psimd_reduce_max_f32(vmax0123);
+  if XNN_UNLIKELY(n != 0) {
+    // Scalar tail: at most 3 elements remain.
+    do {
+      const float vx = *x++;
+      vmax = math_max_f32(vx, vmax);
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+  *y = vmax;
+}
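The four max accumulators in the main loop above exist to break the serial dependency on a single register: the four psimd_max_f32 operations per iteration are independent and can execute in parallel. Below is an illustrative scalar analogue of the same structure (not part of the commit; n is a byte count, as in the kernel).

    #include <assert.h>
    #include <stddef.h>

    // Scalar sketch of the rmax reduction (illustration only).
    static void f32_rmax_scalar_sketch(size_t n, const float* x, float* y) {
      assert(n != 0);
      assert(n % sizeof(float) == 0);
      // Seed all accumulators with the first element.
      float m0 = x[0], m1 = x[0], m2 = x[0], m3 = x[0];
      for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
        // Four independent comparisons per iteration: no chain between them.
        m0 = x[0] > m0 ? x[0] : m0;
        m1 = x[1] > m1 ? x[1] : m1;
        m2 = x[2] > m2 ? x[2] : m2;
        m3 = x[3] > m3 ? x[3] : m3;
        x += 4;
      }
      // Combine the accumulators, then sweep up the 0-3 remaining elements.
      float m = m0 > m1 ? m0 : m1;
      if (m2 > m) m = m2;
      if (m3 > m) m = m3;
      for (; n != 0; n -= sizeof(float)) {
        if (*x > m) m = *x;
        x++;
      }
      *y = m;
    }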
diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h
index ca02584..5b2c36b 100644
--- a/src/xnnpack/raddstoreexpminusmax.h
+++ b/src/xnnpack/raddstoreexpminusmax.h
@@ -23,6 +23,19 @@
       float* sum,                                                  \
       float max);
 
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5)
+
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64)
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2)
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4)
@@ -49,6 +62,19 @@
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc3)
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc6)
 
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5)
+
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x1)
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2)
 DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2_acc2)
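For context, the two ukernel families declared in this header pair naturally in a softmax-style primitive: rmax finds the row maximum, raddstoreexpminusmax writes exp(x - max) and its sum, and a final pass normalizes. The sketch below is illustrative only -- the normalization loop and the particular variants chosen are assumptions, not taken from this commit; sizes are byte counts, matching the asserts in the kernels.

    #include <stddef.h>

    #include <xnnpack/raddstoreexpminusmax.h>
    #include <xnnpack/rmax.h>

    // Illustrative composition of the declared ukernels (not part of the commit).
    static void softmax_sketch(size_t batch_bytes, const float* input, float* output) {
      float max_value, sum;
      xnn_f32_rmax_ukernel__sse(batch_bytes, input, &max_value);
      xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2(
          batch_bytes, input, output, &sum, max_value);
      const float inv_sum = 1.0f / sum;
      for (size_t i = 0; i < batch_bytes / sizeof(float); i++) {
        output[i] *= inv_sum;  // output now holds exp(x - max) / sum(exp(x - max))
      }
    }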
diff --git a/src/xnnpack/rmax.h b/src/xnnpack/rmax.h
index 0dc1996..0f0ee71 100644
--- a/src/xnnpack/rmax.h
+++ b/src/xnnpack/rmax.h
@@ -24,11 +24,12 @@
       const float* x,                              \
       float* y);
 
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__neon)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__sse)
 DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx)
 DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx512f)
-DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__neon)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__psimd)
 DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__scalar)
-DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__sse)
 
 
 #define DECLARE_U8_RMAX_UKERNEL_FUNCTION(fn_name) \
@@ -38,8 +39,8 @@
       uint8_t* y);
 
 DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__neon)
-DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__scalar)
 DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__sse2)
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__scalar)
 
 
 #ifdef __cplusplus