SSE2/PSIMD RAddStoreExpMinusMax micro-kernels
PiperOrigin-RevId: 291432270
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
new file mode 100644
index 0000000..8ff55cf
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
@@ -0,0 +1,244 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents, rotating between the two accumulators.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
new file mode 100644
index 0000000..89f7a49
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
@@ -0,0 +1,246 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  psimd_f32 vacc2 = psimd_zero_f32();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents, rotating among the three accumulators.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+    vacc2 = psimd_add_f32(vacc2, vf89AB);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+  vacc0 = psimd_add_f32(vacc0, vacc2);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
new file mode 100644
index 0000000..b517915
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
@@ -0,0 +1,241 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+    // Load 12 (3x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    input += 12;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+
+    // Store 12 (3x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    output += 12;
+
+    // Accumulate computed exponents (single accumulator in this variant).
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
new file mode 100644
index 0000000..38db010
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
@@ -0,0 +1,260 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+ const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+ const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+ const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+ const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+ const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+ const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+ const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+ const psimd_f32 vi_max = psimd_splat_f32(max);
+
+ psimd_f32 vacc0 = psimd_zero_f32();
+ psimd_f32 vacc1 = psimd_zero_f32();
+ for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+ // Load 16 (4x4) inputs at a time.
+ const psimd_f32 vi0123 = psimd_load_f32(input);
+ const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+ const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+ const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+ input += 16;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+ const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+ const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+ const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+ psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+ psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+ psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+ const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+ const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+ const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+ vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+ vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+ vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+ psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+ psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+ psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+
+ vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+ vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+ vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+ vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+ psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+ psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+ psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = psimd_mul_f32(vt0123, vs0123);
+ vt4567 = psimd_mul_f32(vt4567, vs4567);
+ vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+ vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+ psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+ psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+ psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+ psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+ vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+ vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+ vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+
+ // Store 16 (4x4) outputs at a time.
+ psimd_store_f32(output, vf0123);
+ psimd_store_f32(output + 4, vf4567);
+ psimd_store_f32(output + 8, vf89AB);
+ psimd_store_f32(output + 12, vfCDEF);
+ output += 16;
+
+ // Accumulate computed exponents, alternating between the two accumulators to shorten the FP-add dependency chain.
+ vacc0 = psimd_add_f32(vacc0, vf0123);
+ vacc1 = psimd_add_f32(vacc1, vf4567);
+ vacc0 = psimd_add_f32(vacc0, vf89AB);
+ vacc1 = psimd_add_f32(vacc1, vfCDEF);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = psimd_add_f32(vacc0, vacc1);
+
+ psimd_f32 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ // Store 4 outputs at a time.
+ psimd_store_f32(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ psimd_store2_f32(output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+ vf = psimd_concat_hi_f32(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ psimd_store1_f32(output, vf);
+
+ // Accumulate 1 computed exponent.
+ const psimd_f32 vzero = psimd_zero_f32();
+ vf = psimd_concat_lo_f32(vf, vzero);
+ vf = psimd_concat_even_f32(vf, vzero);
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
new file mode 100644
index 0000000..12882c1
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
@@ -0,0 +1,264 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+ const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+ const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+ const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+ const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+ const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+ const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+ const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+ const psimd_f32 vi_max = psimd_splat_f32(max);
+
+ psimd_f32 vacc0 = psimd_zero_f32();
+ psimd_f32 vacc1 = psimd_zero_f32();
+ psimd_f32 vacc2 = psimd_zero_f32();
+ psimd_f32 vacc3 = psimd_zero_f32();
+ for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+ // Load 16 (4x4) inputs at a time.
+ const psimd_f32 vi0123 = psimd_load_f32(input);
+ const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+ const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+ const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+ input += 16;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+ const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+ const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+ const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+ psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+ psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+ psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+ const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+ const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+ const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+ vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+ vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+ vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+ psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+ psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+ psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+
+ vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+ vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+ vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+ vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+ psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+ psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+ psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = psimd_mul_f32(vt0123, vs0123);
+ vt4567 = psimd_mul_f32(vt4567, vs4567);
+ vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+ vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+ psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+ psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+ psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+ psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+ vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+ vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+ vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+
+ // Store 16 (4x4) outputs at a time.
+ psimd_store_f32(output, vf0123);
+ psimd_store_f32(output + 4, vf4567);
+ psimd_store_f32(output + 8, vf89AB);
+ psimd_store_f32(output + 12, vfCDEF);
+ output += 16;
+
+ // Accumulate computed exponents, spreading across all four accumulators to shorten the FP-add dependency chain.
+ vacc0 = psimd_add_f32(vacc0, vf0123);
+ vacc1 = psimd_add_f32(vacc1, vf4567);
+ vacc2 = psimd_add_f32(vacc2, vf89AB);
+ vacc3 = psimd_add_f32(vacc3, vfCDEF);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = psimd_add_f32(vacc0, vacc1);
+ vacc2 = psimd_add_f32(vacc2, vacc3);
+ vacc0 = psimd_add_f32(vacc0, vacc2);
+
+ psimd_f32 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ // Store 4 outputs at a time.
+ psimd_store_f32(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ psimd_store2_f32(output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+ vf = psimd_concat_hi_f32(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ psimd_store1_f32(output, vf);
+
+ // Accumulate 1 computed exponent.
+ const psimd_f32 vzero = psimd_zero_f32();
+ vf = psimd_concat_lo_f32(vf, vzero);
+ vf = psimd_concat_even_f32(vf, vzero);
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
new file mode 100644
index 0000000..8329c65
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
@@ -0,0 +1,257 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+ const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+ const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+ const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+ const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+ const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+ const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+ const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+ const psimd_f32 vi_max = psimd_splat_f32(max);
+
+ psimd_f32 vacc0 = psimd_zero_f32();
+ for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+ // Load 16 (4x4) inputs at a time.
+ const psimd_f32 vi0123 = psimd_load_f32(input);
+ const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+ const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+ const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+ input += 16;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+ const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+ const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+ const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+ psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+ psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+ psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+ const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+ const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+ const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+ vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+ vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+ vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+ psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+ psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+ psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+
+ vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+ vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+ vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+ vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+ psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+ psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+ psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+
+ vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+ vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+ vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+ vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = psimd_mul_f32(vt0123, vs0123);
+ vt4567 = psimd_mul_f32(vt4567, vs4567);
+ vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+ vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+
+ psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+ psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+ psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+ psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+ vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+ vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+ vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+
+ // Store 16 (4x4) outputs at a time.
+ psimd_store_f32(output, vf0123);
+ psimd_store_f32(output + 4, vf4567);
+ psimd_store_f32(output + 8, vf89AB);
+ psimd_store_f32(output + 12, vfCDEF);
+ output += 16;
+
+ // Accumulate computed exponents into the single accumulator.
+ vacc0 = psimd_add_f32(vacc0, vf0123);
+ vacc0 = psimd_add_f32(vacc0, vf4567);
+ vacc0 = psimd_add_f32(vacc0, vf89AB);
+ vacc0 = psimd_add_f32(vacc0, vfCDEF);
+ }
+
+ psimd_f32 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ // Store 4 outputs at a time.
+ psimd_store_f32(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ psimd_store2_f32(output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+ vf = psimd_concat_hi_f32(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ psimd_store1_f32(output, vf);
+
+ // Accumulate 1 computed exponent.
+ const psimd_f32 vzero = psimd_zero_f32();
+ vf = psimd_concat_lo_f32(vf, vzero);
+ vf = psimd_concat_even_f32(vf, vzero);
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
new file mode 100644
index 0000000..939c2a5
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
@@ -0,0 +1,276 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 mantissa bits are zeroes (Cody-Waite hi part of ln(2)).
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+    const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    psimd_store_f32(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents. NOTE(review): this acc2 variant adds all 5 vectors into vacc0 and never writes vacc1 (merged below while still zero) — presumably a generator artifact; results are unaffected. Confirm against the acc5 variant.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+    vacc0 = psimd_add_f32(vacc0, vfCDEF);
+    vacc0 = psimd_add_f32(vacc0, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load a full vector of 4 inputs (only 1-3 are valid — presumably the tail is readable; confirm with callers).
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce the 4 lanes of the SIMD register into the scalar sum.
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
new file mode 100644
index 0000000..c037620
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
@@ -0,0 +1,282 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 mantissa bits are zeroes (Cody-Waite hi part of ln(2)).
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  psimd_f32 vacc2 = psimd_zero_f32();
+  psimd_f32 vacc3 = psimd_zero_f32();
+  psimd_f32 vacc4 = psimd_zero_f32();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+    const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    psimd_store_f32(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents, spread across all 5 accumulators to shorten the add dependency chain.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc4 = psimd_add_f32(vacc4, vf4567);
+    vacc3 = psimd_add_f32(vacc3, vf89AB);
+    vacc2 = psimd_add_f32(vacc2, vfCDEF);
+    vacc1 = psimd_add_f32(vacc1, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+  vacc2 = psimd_add_f32(vacc2, vacc3);
+  vacc0 = psimd_add_f32(vacc0, vacc2);
+  vacc0 = psimd_add_f32(vacc0, vacc4);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load a full vector of 4 inputs (only 1-3 are valid — presumably the tail is readable; confirm with callers).
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce the 4 lanes of the SIMD register into the scalar sum.
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
new file mode 100644
index 0000000..5b80fa0
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
@@ -0,0 +1,273 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 mantissa bits are zeroes (Cody-Waite hi part of ln(2)).
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    const psimd_f32 vi89AB = psimd_load_f32(input + 8);
+    const psimd_f32 viCDEF = psimd_load_f32(input + 12);
+    const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+    const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
+    const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
+    const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+    psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
+    psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
+    psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+    const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
+    const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
+    const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+    vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
+    vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
+    vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+    psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
+    psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
+    psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+    vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
+    vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
+    vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+    psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
+    psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
+    psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+    vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
+    vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
+    vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+    vt89AB = psimd_mul_f32(vt89AB, vs89AB);
+    vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
+    vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+    psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
+    psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
+    psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+    vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
+    vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
+    vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    psimd_store_f32(output + 8, vf89AB);
+    psimd_store_f32(output + 12, vfCDEF);
+    psimd_store_f32(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+    vacc0 = psimd_add_f32(vacc0, vf89AB);
+    vacc0 = psimd_add_f32(vacc0, vfCDEF);
+    vacc0 = psimd_add_f32(vacc0, vfGHIJ);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load a full vector of 4 inputs (only 1-3 are valid — presumably the tail is readable; confirm with callers).
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get the final n := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent.
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce the 4 lanes of the SIMD register into the scalar sum.
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
new file mode 100644
index 0000000..777837d
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 trailing mantissa bits are zeroes, so n * ln2_hi is exact (hi/lo Cody-Waite split of log(2)).
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 (1x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+
+    // Store 4 (1x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time. (Note: in this x4 variant the residual loop is unreachable -- the main loop above already consumed every full group of 4 -- but the code generator emits it for all variants.)
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load a full vector of 4 inputs; only the leading 1-3 lanes are valid (presumably the buffer is readable past `elements` -- invalid lanes are never stored or accumulated below).
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents (zero the upper half so junk lanes don't enter the sum).
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent (zero all lanes except lane 0 before adding).
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
new file mode 100644
index 0000000..2132ed3
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
@@ -0,0 +1,228 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 trailing mantissa bits are zeroes, so n * ln2_hi is exact (hi/lo Cody-Waite split of log(2)).
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  psimd_f32 vacc1 = psimd_zero_f32();
+  for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+    // Load 8 (2x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    input += 8;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+
+    // Store 8 (2x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    output += 8;
+
+    // Accumulate computed exponents, alternating between the two accumulators (acc2) to shorten the FP-add dependency chain.
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc1 = psimd_add_f32(vacc1, vf4567);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = psimd_add_f32(vacc0, vacc1);
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load a full vector of 4 inputs; only the leading 1-3 lanes are valid (invalid lanes are never stored or accumulated below).
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents (zero the upper half so junk lanes don't enter the sum).
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent (zero all lanes except lane 0 before adding).
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
new file mode 100644
index 0000000..06a6a75
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
@@ -0,0 +1,225 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+  const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+  // Last 7 trailing mantissa bits are zeroes, so n * ln2_hi is exact (hi/lo Cody-Waite split of log(2)).
+  const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+  const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+  const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+  const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+  const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+  const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+  const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+  const psimd_f32 vi_max = psimd_splat_f32(max);
+
+  psimd_f32 vacc0 = psimd_zero_f32();
+  for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+    // Load 8 (2x4) inputs at a time.
+    const psimd_f32 vi0123 = psimd_load_f32(input);
+    const psimd_f32 vi4567 = psimd_load_f32(input + 4);
+    input += 8;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
+    const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
+    psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
+    const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
+    vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
+    psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
+
+    vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
+    vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
+    psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
+
+    vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
+    vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = psimd_mul_f32(vt0123, vs0123);
+    vt4567 = psimd_mul_f32(vt4567, vs4567);
+
+    psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
+    psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
+    vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
+
+    // Store 8 (2x4) outputs at a time.
+    psimd_store_f32(output, vf0123);
+    psimd_store_f32(output + 4, vf4567);
+    output += 8;
+
+    // Accumulate computed exponents (single-accumulator variant: both halves go into vacc0).
+    vacc0 = psimd_add_f32(vacc0, vf0123);
+    vacc0 = psimd_add_f32(vacc0, vf4567);
+  }
+
+  psimd_f32 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time (residual loop for a remaining full group of 4).
+    const psimd_f32 vi = psimd_load_f32(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    // Store 4 outputs at a time.
+    psimd_store_f32(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = psimd_add_f32(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load a full vector of 4 inputs; only the leading 1-3 lanes are valid (invalid lanes are never stored or accumulated below).
+    const psimd_f32 vi = psimd_load_f32(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+    // Compute reduced argument elements := round(x / log(2)).
+    psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+    // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+    const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+    // Subtract the large number back to get final elements := round(x / log(2)).
+    vn = psimd_sub_f32(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - elements * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+    vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+    vp = psimd_qfma_f32(vc3, vp, vt);
+    vp = psimd_qfma_f32(vc2, vp, vt);
+    vp = psimd_qfma_f32(vc1, vp, vt);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = psimd_mul_f32(vt, vs);
+    psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      psimd_store2_f32(output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents (zero the upper half so junk lanes don't enter the sum).
+      vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+      vf = psimd_concat_hi_f32(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      psimd_store1_f32(output, vf);
+
+      // Accumulate 1 computed exponent (zero all lanes except lane 0 before adding).
+      const psimd_f32 vzero = psimd_zero_f32();
+      vf = psimd_concat_lo_f32(vf, vzero);
+      vf = psimd_concat_even_f32(vf, vzero);
+      vacc = psimd_add_f32(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
new file mode 100644
index 0000000..ae36a76
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc2.c
@@ -0,0 +1,243 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();
+ __m128 vacc1 = _mm_setzero_ps();
+ for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+ // Load 12 (3x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ const __m128 vi89AB = _mm_loadu_ps(input + 8);
+ input += 12;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+ const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+ __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+ const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+ vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+ __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+ vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+ // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+ __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+ vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+ __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+ vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+
+ // Store 12 (3x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ _mm_storeu_ps(output + 8, vf89AB);
+ output += 12;
+
+ // Accumulate computed exponents.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc0 = _mm_add_ps(vacc0, vf4567);
+ vacc0 = _mm_add_ps(vacc0, vf89AB);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = _mm_add_ps(vacc0, vacc1);
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
new file mode 100644
index 0000000..9761b8f
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12-acc3.c
@@ -0,0 +1,245 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes (Cody-Waite high part: n * vminus_ln2_hi is exact)
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();  // 3 partial sums shorten the FP-add dependency chain
+ __m128 vacc1 = _mm_setzero_ps();
+ __m128 vacc2 = _mm_setzero_ps();
+ for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+ // Load 12 (3x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ const __m128 vi89AB = _mm_loadu_ps(input + 8);
+ input += 12;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+ const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+ __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+ const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+ vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+ __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+ vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+ __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+ vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+ __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+ vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+
+ // Store 12 (3x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ _mm_storeu_ps(output + 8, vf89AB);
+ output += 12;
+
+ // Accumulate computed exponents, one vector per partial sum.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc1 = _mm_add_ps(vacc1, vf4567);
+ vacc2 = _mm_add_ps(vacc2, vf89AB);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = _mm_add_ps(vacc0, vacc1);
+ vacc0 = _mm_add_ps(vacc0, vacc2);
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load a full vector even though only 1-3 elements remain; NOTE(review): reads past the last element — confirm the input buffer guarantees readable padding.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
new file mode 100644
index 0000000..9f8cf69
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x12.c
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes (Cody-Waite high part: n * vminus_ln2_hi is exact)
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();  // single accumulator variant (no -accN suffix)
+ for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
+ // Load 12 (3x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ const __m128 vi89AB = _mm_loadu_ps(input + 8);
+ input += 12;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+ const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+ __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+ const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+ vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+ __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+ vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+ __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+ vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+ __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+ vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+
+ // Store 12 (3x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ _mm_storeu_ps(output + 8, vf89AB);
+ output += 12;
+
+ // Accumulate computed exponents.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc0 = _mm_add_ps(vacc0, vf4567);
+ vacc0 = _mm_add_ps(vacc0, vf89AB);
+ }
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load a full vector even though only 1-3 elements remain; NOTE(review): reads past the last element — confirm the input buffer guarantees readable padding.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
new file mode 100644
index 0000000..2380227
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc2.c
@@ -0,0 +1,259 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes (Cody-Waite high part: n * vminus_ln2_hi is exact)
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();  // 2 partial sums shorten the FP-add dependency chain
+ __m128 vacc1 = _mm_setzero_ps();
+ for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+ // Load 16 (4x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ const __m128 vi89AB = _mm_loadu_ps(input + 8);
+ const __m128 viCDEF = _mm_loadu_ps(input + 12);
+ input += 16;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+ const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+ const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+ __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+ __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+ const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+ const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+ vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+ vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+ __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+ __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+ vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+ vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+ __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+ __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+ vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+ vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+ __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+ __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+ vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+ vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+
+ // Store 16 (4x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ _mm_storeu_ps(output + 8, vf89AB);
+ _mm_storeu_ps(output + 12, vfCDEF);
+ output += 16;
+
+ // Accumulate computed exponents, alternating between the two accumulators (otherwise vacc1 would stay unused and the acc2 variant would gain nothing over the plain x16 kernel).
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc1 = _mm_add_ps(vacc1, vf4567);
+ vacc0 = _mm_add_ps(vacc0, vf89AB);
+ vacc1 = _mm_add_ps(vacc1, vfCDEF);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = _mm_add_ps(vacc0, vacc1);
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load a full vector even though only 1-3 elements remain; NOTE(review): reads past the last element — confirm the input buffer guarantees readable padding.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
new file mode 100644
index 0000000..1e60b65
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16-acc4.c
@@ -0,0 +1,263 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  __m128 vacc2 = _mm_setzero_ps();
+  __m128 vacc3 = _mm_setzero_ps();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents, spreading over all 4 accumulators to break the dependency chain.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc2 = _mm_add_ps(vacc2, vf89AB);
+    vacc3 = _mm_add_ps(vacc3, vfCDEF);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+  vacc2 = _mm_add_ps(vacc2, vacc3);
+  vacc0 = _mm_add_ps(vacc0, vacc2);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
new file mode 100644
index 0000000..09f542f
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x16.c
@@ -0,0 +1,256 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
+    // Load 16 (4x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    input += 16;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+
+    // Store 16 (4x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    output += 16;
+
+    // Accumulate computed exponents (single accumulator in this variant).
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc0 = _mm_add_ps(vacc0, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+    vacc0 = _mm_add_ps(vacc0, vfCDEF);
+  }
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time (reads up to 3 floats past the last element; lanes beyond the tail are masked out below).
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents (upper 2 lanes zeroed before the add).
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
new file mode 100644
index 0000000..f92b791
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
@@ -0,0 +1,275 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2(
+    size_t elements,
+    const float* input,
+    float* output,
+    float* sum,
+    float max)
+{
+  assert(elements % sizeof(float) == 0);
+
+  const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+  const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+  // Last 7 bits are zeroes
+  const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+  const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+  const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+  const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+  const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+  const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+  const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+  const __m128 vi_max = _mm_set1_ps(max);
+
+  __m128 vacc0 = _mm_setzero_ps();
+  __m128 vacc1 = _mm_setzero_ps();
+  for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+    // Load 20 (5x4) inputs at a time.
+    const __m128 vi0123 = _mm_loadu_ps(input);
+    const __m128 vi4567 = _mm_loadu_ps(input + 4);
+    const __m128 vi89AB = _mm_loadu_ps(input + 8);
+    const __m128 viCDEF = _mm_loadu_ps(input + 12);
+    const __m128 viGHIJ = _mm_loadu_ps(input + 16);
+    input += 20;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+    const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+    const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+    const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+    const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+    __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+    __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+    __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+    __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+    const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+    const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+    const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+    const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+    vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+    vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+    vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+    vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+    __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+    __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+    __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+    __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
+
+    vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+    vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+    vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+    vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+    vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+    __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+    __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+    __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+    __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+    vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+    vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+    vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+    vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+    vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt0123 = _mm_mul_ps(vt0123, vs0123);
+    vt4567 = _mm_mul_ps(vt4567, vs4567);
+    vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+    vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+    vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+    __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+    __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+    __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+    __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+    __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+    vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+    vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+    vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+    vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
+
+    // Store 20 (5x4) outputs at a time.
+    _mm_storeu_ps(output, vf0123);
+    _mm_storeu_ps(output + 4, vf4567);
+    _mm_storeu_ps(output + 8, vf89AB);
+    _mm_storeu_ps(output + 12, vfCDEF);
+    _mm_storeu_ps(output + 16, vfGHIJ);
+    output += 20;
+
+    // Accumulate computed exponents, alternating between the 2 accumulators to break the dependency chain.
+    vacc0 = _mm_add_ps(vacc0, vf0123);
+    vacc1 = _mm_add_ps(vacc1, vf4567);
+    vacc0 = _mm_add_ps(vacc0, vf89AB);
+    vacc1 = _mm_add_ps(vacc1, vfCDEF);
+    vacc0 = _mm_add_ps(vacc0, vfGHIJ);
+  }
+  // Add up all accumulators to vacc0
+  vacc0 = _mm_add_ps(vacc0, vacc1);
+
+  __m128 vacc = vacc0;
+  for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+    input += 4;
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    // Store 4 outputs at a time.
+    _mm_storeu_ps(output, vf);
+    output += 4;
+
+    // Accumulate computed exponents.
+    vacc = _mm_add_ps(vacc, vf);
+  }
+  if (elements != 0) {
+    assert(elements >= 1 * sizeof(float));
+    assert(elements <= 3 * sizeof(float));
+    // Load 4 inputs at a time.
+    const __m128 vi = _mm_loadu_ps(input);
+
+    // Subtract maximum input x := i - i_max. This implies x <= 0.
+    const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+    // Compute reduced argument n := round(x / log(2)).
+    __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+    vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+    vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm_mul_ps(vt, vs);
+    __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+    // For inputs below zero cutoff, replace output with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+    if (elements & (2 * sizeof(float))) {
+      // Store 2 outputs at a time.
+      _mm_storel_pi((__m64*) output, vf);
+      output += 2;
+
+      // Accumulate 2 computed exponents.
+      vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+      vf = _mm_movehl_ps(vf, vf);
+    }
+    if (elements & (1 * sizeof(float))) {
+      // Store 1 output at a time.
+      _mm_store_ss(output, vf);
+
+      // Accumulate 1 computed exponent.
+      vacc = _mm_add_ss(vacc, vf);
+    }
+  }
+  // Reduce 4 elements in the SIMD register
+  vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+  vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+  _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
new file mode 100644
index 0000000..d58661b
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc5.c
@@ -0,0 +1,281 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+// RAddStoreExpMinusMax micro-kernel (SSE2, degree-5 polynomial, 20 elements
+// per main-loop iteration, 5 partial accumulators): for every input i this
+// computes f = exp(i - max), stores f to `output`, and writes the sum of all
+// f values to `*sum` — the exp-and-reduce pass of a softmax-style computation.
+//
+//   elements - number of input BYTES; must be a multiple of sizeof(float)
+//   input    - input floats (read with unaligned loads)
+//   output   - output floats, one per input float
+//   sum      - receives the scalar sum of all outputs
+//   max      - precomputed maximum input, so x = i - max <= 0 and exp(x) <= 1
+//              (keeps exp from overflowing)
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ // Rounding magic: adding 0x1.8000FEp23 to x*log2e leaves round(x*log2e) in
+ // the low mantissa bits (valid because inputs are bounded after the max
+ // subtraction).
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ // Coefficients of the degree-5 minimax polynomial for exp(t).
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ // Five independent partial sums, combined after the main loop.
+ __m128 vacc0 = _mm_setzero_ps();
+ __m128 vacc1 = _mm_setzero_ps();
+ __m128 vacc2 = _mm_setzero_ps();
+ __m128 vacc3 = _mm_setzero_ps();
+ __m128 vacc4 = _mm_setzero_ps();
+ for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+ // Load 20 (5x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ const __m128 vi89AB = _mm_loadu_ps(input + 8);
+ const __m128 viCDEF = _mm_loadu_ps(input + 12);
+ const __m128 viGHIJ = _mm_loadu_ps(input + 16);
+ input += 20;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+ const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+ const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+ const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+ __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+ __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+ __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ // (Shifting the biased-integer representation left by 23 places n in the binary32 exponent field.)
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+ const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+ const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+ const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+ vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+ vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+ vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+ __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+ __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+ __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+ vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+ vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+ vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+ __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+ __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+ __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+ vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+ vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+ vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+ vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+ vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+ vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+ __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+ __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+ __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+ vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+ vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+ vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
+
+ // Store 20 (5x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ _mm_storeu_ps(output + 8, vf89AB);
+ _mm_storeu_ps(output + 12, vfCDEF);
+ _mm_storeu_ps(output + 16, vfGHIJ);
+ output += 20;
+
+ // Accumulate computed exponents.
+ // NOTE(review): the accumulators are assigned out of order (vacc4 takes
+ // vf4567, vacc1 takes vfGHIJ, etc.). Each vf still lands in exactly one
+ // accumulator and all accumulators are summed below, so the result is a
+ // valid total; confirm against the generator whether the ordering is
+ // intentional.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc4 = _mm_add_ps(vacc4, vf4567);
+ vacc3 = _mm_add_ps(vacc3, vf89AB);
+ vacc2 = _mm_add_ps(vacc2, vfCDEF);
+ vacc1 = _mm_add_ps(vacc1, vfGHIJ);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = _mm_add_ps(vacc0, vacc1);
+ vacc2 = _mm_add_ps(vacc2, vacc3);
+ vacc0 = _mm_add_ps(vacc0, vacc2);
+ vacc0 = _mm_add_ps(vacc0, vacc4);
+
+ __m128 vacc = vacc0;
+ // Tail: process full groups of 4 remaining floats.
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ // NOTE(review): this loads a full vector even though only 1-3 floats
+ // remain, i.e. it may read up to 3 floats past the end of `input` —
+ // confirm callers guarantee such over-reads are safe.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents (zero the upper half first so only the
+ // two stored lanes contribute).
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register (horizontal sum into lane 0).
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
new file mode 100644
index 0000000..3ab5db3
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x20.c
@@ -0,0 +1,272 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+// RAddStoreExpMinusMax micro-kernel (SSE2, degree-5 polynomial, 20 elements
+// per main-loop iteration, single accumulator): for every input i this
+// computes f = exp(i - max), stores f to `output`, and writes the sum of all
+// f values to `*sum` — the exp-and-reduce pass of a softmax-style computation.
+//
+//   elements - number of input BYTES; must be a multiple of sizeof(float)
+//   input    - input floats (read with unaligned loads)
+//   output   - output floats, one per input float
+//   sum      - receives the scalar sum of all outputs
+//   max      - precomputed maximum input, so x = i - max <= 0 and exp(x) <= 1
+//              (keeps exp from overflowing)
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ // Rounding magic: adding 0x1.8000FEp23 to x*log2e leaves round(x*log2e) in
+ // the low mantissa bits (valid because inputs are bounded after the max
+ // subtraction).
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ // Coefficients of the degree-5 minimax polynomial for exp(t).
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();
+ for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
+ // Load 20 (5x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ const __m128 vi89AB = _mm_loadu_ps(input + 8);
+ const __m128 viCDEF = _mm_loadu_ps(input + 12);
+ const __m128 viGHIJ = _mm_loadu_ps(input + 16);
+ input += 20;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+ const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
+ const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
+ const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+ __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
+ __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
+ __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ // (Shifting the biased-integer representation left by 23 places n in the binary32 exponent field.)
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+ const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
+ const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
+ const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+ vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
+ vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
+ vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+ __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
+ __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
+ __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+ vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
+ vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
+ vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+ __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
+ __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
+ __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
+ vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
+ vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+ vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
+ vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
+ vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+ vt89AB = _mm_mul_ps(vt89AB, vs89AB);
+ vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
+ vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+ __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
+ __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
+ __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+ vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
+ vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
+ vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
+
+ // Store 20 (5x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ _mm_storeu_ps(output + 8, vf89AB);
+ _mm_storeu_ps(output + 12, vfCDEF);
+ _mm_storeu_ps(output + 16, vfGHIJ);
+ output += 20;
+
+ // Accumulate computed exponents.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc0 = _mm_add_ps(vacc0, vf4567);
+ vacc0 = _mm_add_ps(vacc0, vf89AB);
+ vacc0 = _mm_add_ps(vacc0, vfCDEF);
+ vacc0 = _mm_add_ps(vacc0, vfGHIJ);
+ }
+
+ __m128 vacc = vacc0;
+ // Tail: process full groups of 4 remaining floats.
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ // NOTE(review): this loads a full vector even though only 1-3 floats
+ // remain, i.e. it may read up to 3 floats past the end of `input` —
+ // confirm callers guarantee such over-reads are safe.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents (zero the upper half first so only the
+ // two stored lanes contribute).
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register (horizontal sum into lane 0).
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
new file mode 100644
index 0000000..157b576
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c
@@ -0,0 +1,208 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+// RAddStoreExpMinusMax micro-kernel (SSE2, degree-5 polynomial, 4 elements
+// per main-loop iteration): for every input i this computes f = exp(i - max),
+// stores f to `output`, and writes the sum of all f values to `*sum` — the
+// exp-and-reduce pass of a softmax-style computation.
+//
+//   elements - number of input BYTES; must be a multiple of sizeof(float)
+//   input    - input floats (read with unaligned loads)
+//   output   - output floats, one per input float
+//   sum      - receives the scalar sum of all outputs
+//   max      - precomputed maximum input, so x = i - max <= 0 and exp(x) <= 1
+//              (keeps exp from overflowing)
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ // Rounding magic: adding 0x1.8000FEp23 to x*log2e leaves round(x*log2e) in
+ // the low mantissa bits (valid because inputs are bounded after the max
+ // subtraction).
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ // Coefficients of the degree-5 minimax polynomial for exp(t).
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 (1x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ // (Shifting the biased-integer representation left by 23 places n in the binary32 exponent field.)
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+
+ // Store 4 (1x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ }
+
+ __m128 vacc = vacc0;
+ // NOTE(review): this loop is unreachable — the loop above already consumed
+ // every full group of 4 floats, so `elements < 4 * sizeof(float)` holds
+ // here. It is emitted by the shared template (which uses a larger main
+ // tile in the other variants) and is kept for fidelity with the generator.
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ // NOTE(review): this loads a full vector even though only 1-3 floats
+ // remain, i.e. it may read up to 3 floats past the end of `input` —
+ // confirm callers guarantee such over-reads are safe.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument n := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ //     = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents (zero the upper half first so only the
+ // two stored lanes contribute).
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register (horizontal sum into lane 0).
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
new file mode 100644
index 0000000..d3ef0b3
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8-acc2.c
@@ -0,0 +1,227 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+// Computes output[i] := exp(input[i] - max) and accumulates *sum := sum of all
+// computed exponents. "elements" is the input size in bytes (asserted to be a
+// multiple of sizeof(float)). The "_acc2" variant keeps two partial
+// accumulators in the unrolled main loop to shorten the FP add dependency chain.
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();
+ __m128 vacc1 = _mm_setzero_ps();
+ for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+ // Load 8 (2x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ input += 8;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+
+ // Store 8 (2x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ output += 8;
+
+ // Accumulate computed exponents into two separate accumulators. Previously
+ // both registers were added into vacc0, leaving vacc1 permanently zero and
+ // defeating the purpose of the acc2 variant (breaking the add dependency chain).
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc1 = _mm_add_ps(vacc1, vf4567);
+ }
+ // Add up all accumulators to vacc0
+ vacc0 = _mm_add_ps(vacc0, vacc1);
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
new file mode 100644
index 0000000..0460e42
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/gen/sse2-p5-x8.c
@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+// Computes output[i] := exp(input[i] - max) and accumulates *sum := sum of all
+// computed exponents. "elements" is the input size in bytes (asserted to be a
+// multiple of sizeof(float)); the main loop processes 8 floats per iteration
+// with a single accumulator, followed by 4-wide and scalar remainder handling.
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ __m128 vacc0 = _mm_setzero_ps();
+ for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
+ // Load 8 (2x4) inputs at a time.
+ const __m128 vi0123 = _mm_loadu_ps(input);
+ const __m128 vi4567 = _mm_loadu_ps(input + 4);
+ input += 8;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
+ const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
+ __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
+ const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
+ vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
+ __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
+
+ vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
+ vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
+ __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
+
+ vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
+ vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt0123 = _mm_mul_ps(vt0123, vs0123);
+ vt4567 = _mm_mul_ps(vt4567, vs4567);
+
+ __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
+ __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
+ vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
+
+ // Store 8 (2x4) outputs at a time.
+ _mm_storeu_ps(output, vf0123);
+ _mm_storeu_ps(output + 4, vf4567);
+ output += 8;
+
+ // Accumulate computed exponents.
+ vacc0 = _mm_add_ps(vacc0, vf0123);
+ vacc0 = _mm_add_ps(vacc0, vf4567);
+ }
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ // NOTE(review): this reads up to 4 floats even though only 1-3 remain —
+ // presumably callers guarantee the trailing bytes are readable; confirm.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/psimd-p5.c.in b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
new file mode 100644
index 0000000..7cba991
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
@@ -0,0 +1,236 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert ELEMENTS_TILE % 4 == 0
+$assert ELEMENTS_TILE >= 4
+$SIMD_TILE = ELEMENTS_TILE // 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+// xngen template: computes output[i] := exp(input[i] - max) and accumulates
+// *sum := sum of all computed exponents; "elements" is the input size in bytes.
+void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
+ const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
+ const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
+
+ const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
+ const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
+ const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
+ const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
+ const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
+
+ const psimd_f32 vi_max = psimd_splat_f32(max);
+
+ $for K in range(ACCUMULATORS):
+ psimd_f32 vacc${K} = psimd_zero_f32();
+ for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
+ // Load ${ELEMENTS_TILE} (${SIMD_TILE}x4) inputs at a time.
+ const psimd_f32 vi${ABC[0:4]} = psimd_load_f32(input);
+ $for N in range(4, ELEMENTS_TILE, 4):
+ const psimd_f32 vi${ABC[N:N+4]} = psimd_load_f32(input + ${N});
+ input += ${ELEMENTS_TILE};
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ const psimd_f32 vx${ABC[N:N+4]} = psimd_sub_f32(vi${ABC[N:N+4]}, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ $for N in range(0, ELEMENTS_TILE, 4):
+ psimd_f32 vn${ABC[N:N+4]} = psimd_qfma_f32(vmagic_bias, vx${ABC[N:N+4]}, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ const psimd_f32 vs${ABC[N:N+4]} = (psimd_f32) ((psimd_u32) vn${ABC[N:N+4]} << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vn${ABC[N:N+4]} = psimd_sub_f32(vn${ABC[N:N+4]}, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ psimd_f32 vt${ABC[N:N+4]} = psimd_qfma_f32(vx${ABC[N:N+4]}, vn${ABC[N:N+4]}, vminus_ln2_hi);
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vt${ABC[N:N+4]} = psimd_qfma_f32(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ $for N in range(0, ELEMENTS_TILE, 4):
+ psimd_f32 vp${ABC[N:N+4]} = psimd_qfma_f32(vc4, vc5, vt${ABC[N:N+4]});
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vp${ABC[N:N+4]} = psimd_qfma_f32(vc3, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vp${ABC[N:N+4]} = psimd_qfma_f32(vc2, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vp${ABC[N:N+4]} = psimd_qfma_f32(vc1, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vt${ABC[N:N+4]} = psimd_mul_f32(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ psimd_f32 vf${ABC[N:N+4]} = psimd_qfma_f32(vs${ABC[N:N+4]}, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vf${ABC[N:N+4]} = psimd_andnotmask_f32(vx${ABC[N:N+4]} < vdenorm_cutoff, vf${ABC[N:N+4]});
+
+ // Store ${ELEMENTS_TILE} (${SIMD_TILE}x4) outputs at a time.
+ psimd_store_f32(output, vf${ABC[0:4]});
+ $for N in range(4, ELEMENTS_TILE, 4):
+ psimd_store_f32(output + ${N}, vf${ABC[N:N+4]});
+ output += ${ELEMENTS_TILE};
+
+ // Accumulate computed exponents, distributing SIMD registers round-robin over
+ // the partial accumulators. Index by the register number N//4 rather than the
+ // element offset N: since N advances in steps of 4, N % ACCUMULATORS would
+ // always select accumulator 0 for ACCUMULATORS == 2 or 4, leaving the other
+ // accumulators unused and defeating the purpose of the acc variants.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vacc${(N//4) % ACCUMULATORS} = psimd_add_f32(vacc${(N//4) % ACCUMULATORS}, vf${ABC[N:N+4]});
+ }
+ $if ACCUMULATORS > 1:
+ // Add up all accumulators to vacc0
+ $ACC_SLICE = 1
+ $while ACC_SLICE < ACCUMULATORS:
+ $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+ $if A + ACC_SLICE < ACCUMULATORS:
+ vacc${A} = psimd_add_f32(vacc${A}, vacc${A + ACC_SLICE});
+ $ACC_SLICE *= 2
+
+ psimd_f32 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ // Store 4 outputs at a time.
+ psimd_store_f32(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const psimd_f32 vi = psimd_load_f32(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = psimd_sub_f32(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
+ vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
+
+ // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
+ vp = psimd_qfma_f32(vc3, vp, vt);
+ vp = psimd_qfma_f32(vc2, vp, vt);
+ vp = psimd_qfma_f32(vc1, vp, vt);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = psimd_mul_f32(vt, vs);
+ psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ psimd_store2_f32(output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
+
+ vf = psimd_concat_hi_f32(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ psimd_store1_f32(output, vf);
+
+ // Accumulate 1 computed exponent: isolate lane 0 by zeroing the other lanes.
+ const psimd_f32 vzero = psimd_zero_f32();
+ vf = psimd_concat_lo_f32(vf, vzero);
+ vf = psimd_concat_even_f32(vf, vzero);
+ vacc = psimd_add_f32(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ *sum = psimd_reduce_sum_f32(vacc);
+}
diff --git a/src/f32-raddstoreexpminusmax/sse2-p5.c.in b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
new file mode 100644
index 0000000..51a33c8
--- /dev/null
+++ b/src/f32-raddstoreexpminusmax/sse2-p5.c.in
@@ -0,0 +1,235 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert ELEMENTS_TILE % 4 == 0
+$assert ELEMENTS_TILE >= 4
+$SIMD_TILE = ELEMENTS_TILE // 4
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/raddstoreexpminusmax.h>
+
+
+void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+ size_t elements,
+ const float* input,
+ float* output,
+ float* sum,
+ float max)
+{
+ assert(elements % sizeof(float) == 0);
+
+ const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
+ const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
+ // Last 7 bits are zeroes
+ const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
+ const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
+
+ const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
+ const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
+ const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
+ const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
+ const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
+
+ const __m128 vi_max = _mm_set1_ps(max);
+
+ $for K in range(ACCUMULATORS):
+ __m128 vacc${K} = _mm_setzero_ps();
+ for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
+ // Load ${ELEMENTS_TILE} (${SIMD_TILE}x4) inputs at a time.
+ const __m128 vi${ABC[0:4]} = _mm_loadu_ps(input);
+ $for N in range(4, ELEMENTS_TILE, 4):
+ const __m128 vi${ABC[N:N+4]} = _mm_loadu_ps(input + ${N});
+ input += ${ELEMENTS_TILE};
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ const __m128 vx${ABC[N:N+4]} = _mm_sub_ps(vi${ABC[N:N+4]}, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ $for N in range(0, ELEMENTS_TILE, 4):
+ __m128 vn${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vx${ABC[N:N+4]}, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ const __m128 vs${ABC[N:N+4]} = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn${ABC[N:N+4]}), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vn${ABC[N:N+4]} = _mm_sub_ps(vn${ABC[N:N+4]}, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ __m128 vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_hi), vx${ABC[N:N+4]});
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vt${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vn${ABC[N:N+4]}, vminus_ln2_lo), vt${ABC[N:N+4]});
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ $for N in range(0, ELEMENTS_TILE, 4):
+ __m128 vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vc5, vt${ABC[N:N+4]}), vc4);
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc3);
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc2);
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vp${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vp${ABC[N:N+4]}, vt${ABC[N:N+4]}), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vt${ABC[N:N+4]} = _mm_mul_ps(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
+
+ $for N in range(0, ELEMENTS_TILE, 4):
+ __m128 vf${ABC[N:N+4]} = _mm_add_ps(_mm_mul_ps(vt${ABC[N:N+4]}, vp${ABC[N:N+4]}), vs${ABC[N:N+4]});
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vf${ABC[N:N+4]} = _mm_andnot_ps(_mm_cmplt_ps(vx${ABC[N:N+4]}, vdenorm_cutoff), vf${ABC[N:N+4]});
+
+ // Store ${ELEMENTS_TILE} (${SIMD_TILE}x4) outputs at a time.
+ _mm_storeu_ps(output, vf${ABC[0:4]});
+ $for N in range(4, ELEMENTS_TILE, 4):
+ _mm_storeu_ps(output + ${N}, vf${ABC[N:N+4]});
+ output += ${ELEMENTS_TILE};
+
+ // Accumulate computed exponents.
+ $for N in range(0, ELEMENTS_TILE, 4):
+ vacc${N % ACCUMULATORS} = _mm_add_ps(vacc${N % ACCUMULATORS}, vf${ABC[N:N+4]});
+ }
+ $if ACCUMULATORS > 1:
+ // Add up all accumulators to vacc0
+ $ACC_SLICE = 1
+ $while ACC_SLICE < ACCUMULATORS:
+ $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+ $if A + ACC_SLICE < ACCUMULATORS:
+ vacc${A} = _mm_add_ps(vacc${A}, vacc${A + ACC_SLICE});
+ $ACC_SLICE *= 2
+
+ __m128 vacc = vacc0;
+ for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+ input += 4;
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ // Store 4 outputs at a time.
+ _mm_storeu_ps(output, vf);
+ output += 4;
+
+ // Accumulate computed exponents.
+ vacc = _mm_add_ps(vacc, vf);
+ }
+ if (elements != 0) {
+ assert(elements >= 1 * sizeof(float));
+ assert(elements <= 3 * sizeof(float));
+ // Load 4 inputs at a time.
+ const __m128 vi = _mm_loadu_ps(input);
+
+ // Subtract maximum input x := i - i_max. This implies x <= 0.
+ const __m128 vx = _mm_sub_ps(vi, vi_max);
+
+ // Compute reduced argument elements := round(x / log(2)).
+ __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
+ const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
+
+ // Subtract the large number back to get final elements := round(x / log(2)).
+ vn = _mm_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - elements * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
+ vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+ __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
+ vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm_mul_ps(vt, vs);
+ __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
+
+ // For inputs below zero cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
+
+ if (elements & (2 * sizeof(float))) {
+ // Store 2 outputs at a time.
+ _mm_storel_pi((__m64*) output, vf);
+ output += 2;
+
+ // Accumulate 2 computed exponents.
+ vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
+
+ vf = _mm_movehl_ps(vf, vf);
+ }
+ if (elements & (1 * sizeof(float))) {
+ // Store 1 output at a time.
+ _mm_store_ss(output, vf);
+
+ // Accumulate 1 computed exponent.
+ vacc = _mm_add_ss(vacc, vf);
+ }
+ }
+ // Reduce 4 elements in the SIMD register
+ vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
+ vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
+ _mm_store_ss(sum, vacc);
+}
diff --git a/src/f32-rmax/psimd.c b/src/f32-rmax/psimd.c
new file mode 100644
index 0000000..74afbf2
--- /dev/null
+++ b/src/f32-rmax/psimd.c
@@ -0,0 +1,53 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/rmax.h>
+
+
+void xnn_f32_rmax_ukernel__psimd(
+ size_t n,
+ const float* x,
+ float* y)
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+ psimd_f32 vmax0 = psimd_load_splat_f32(x);
+ psimd_f32 vmax1 = vmax0;
+ psimd_f32 vmax2 = vmax0;
+ psimd_f32 vmax3 = vmax0;
+ for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+ const psimd_f32 vx0 = psimd_load_f32(x);
+ const psimd_f32 vx1 = psimd_load_f32(x + 4);
+ const psimd_f32 vx2 = psimd_load_f32(x + 8);
+ const psimd_f32 vx3 = psimd_load_f32(x + 12);
+ x += 16;
+
+ vmax0 = psimd_max_f32(vmax0, vx0);
+ vmax1 = psimd_max_f32(vmax1, vx1);
+ vmax2 = psimd_max_f32(vmax2, vx2);
+ vmax3 = psimd_max_f32(vmax3, vx3);
+ }
+ psimd_f32 vmax0123 = psimd_max_f32(psimd_max_f32(vmax0, vmax1), psimd_max_f32(vmax2, vmax3));
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ const psimd_f32 vx = psimd_load_f32(x);
+ vmax0123 = psimd_max_f32(vmax0123, vx);
+ x += 4;
+ }
+ float vmax = psimd_reduce_max_f32(vmax0123);
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const float vx = *x++;
+ vmax = math_max_f32(vx, vmax);
+ n -= 4;
+ } while (n != 0);
+ }
+ *y = vmax;
+}
diff --git a/src/xnnpack/raddstoreexpminusmax.h b/src/xnnpack/raddstoreexpminusmax.h
index ca02584..5b2c36b 100644
--- a/src/xnnpack/raddstoreexpminusmax.h
+++ b/src/xnnpack/raddstoreexpminusmax.h
@@ -23,6 +23,19 @@
float* sum, \
float max);
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x8_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x12_acc3)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x16_acc4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5)
+
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64)
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2)
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc4)
@@ -49,6 +62,19 @@
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc3)
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc6)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2)
+DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5)
+
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x1)
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2)
DECLARE_F32_RADDSTOREEXPMINUSMAX_UKERNEL_FUNCTION(xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x2_acc2)
diff --git a/src/xnnpack/rmax.h b/src/xnnpack/rmax.h
index 0dc1996..0f0ee71 100644
--- a/src/xnnpack/rmax.h
+++ b/src/xnnpack/rmax.h
@@ -24,11 +24,12 @@
const float* x, \
float* y);
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__neon)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__sse)
DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx)
DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__avx512f)
-DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__neon)
+DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__psimd)
DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__scalar)
-DECLARE_F32_RMAX_UKERNEL_FUNCTION(xnn_f32_rmax_ukernel__sse)
#define DECLARE_U8_RMAX_UKERNEL_FUNCTION(fn_name) \
@@ -38,8 +39,8 @@
uint8_t* y);
DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__neon)
-DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__scalar)
DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__sse2)
+DECLARE_U8_RMAX_UKERNEL_FUNCTION(xnn_u8_rmax_ukernel__scalar)
#ifdef __cplusplus