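Remove PSIMD variants of RADDSTOREEXPMINUSMAX microkernels

Drop the generated psimd-p5 RADDSTOREEXPMINUSMAX sources from the Bazel and
CMake source lists, remove their benchmark registrations and their entries in
scripts/generate-f32-raddstoreexpminusmax.sh, and delete the generated files.

As a side effect, src/requantization/precise-psimd.c and
src/requantization/fp32-psimd.c are now listed under PSIMD_ACCMATH_UKERNELS
(Bazel) and XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS (CMake) instead of the
list that precedes them.

PiperOrigin-RevId: 321850222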
diff --git a/BUILD.bazel b/BUILD.bazel
index ed091dc..4e36cbd 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -866,23 +866,11 @@
"src/x32-zip/x3-psimd.c",
"src/x32-zip/x4-psimd.c",
"src/x32-zip/xm-psimd.c",
- "src/requantization/precise-psimd.c",
- "src/requantization/fp32-psimd.c",
]
PSIMD_ACCMATH_UKERNELS = [
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c",
- "src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c",
+ "src/requantization/precise-psimd.c",
+ "src/requantization/fp32-psimd.c",
]
# ISA-specific micro-kernels
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b82b96c..7cf45e6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -546,22 +546,10 @@
src/x32-zip/x3-psimd.c
src/x32-zip/x4-psimd.c
src/x32-zip/xm-psimd.c
- src/requantization/precise-psimd.c
- src/requantization/fp32-psimd.c)
SET(XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
- src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c)
+ src/requantization/precise-psimd.c
+ src/requantization/fp32-psimd.c)
SET(XNNPACK_NEON_MICROKERNEL_SRCS
src/f32-avgpool/9p8x-minmax-neon-c4.c
diff --git a/bench/f32-raddstoreexpminusmax.cc b/bench/f32-raddstoreexpminusmax.cc
index d665a5d..a342c36 100644
--- a/bench/f32-raddstoreexpminusmax.cc
+++ b/bench/f32-raddstoreexpminusmax.cc
@@ -414,45 +414,6 @@
xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc5)->Apply(CharacteristicArguments)->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x4,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x8,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x8_acc2,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x12,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x12_acc2,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x12_acc3,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x16,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x16_acc2,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x16_acc4,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x20,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x20_acc2,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2)->Apply(CharacteristicArguments)->UseRealTime();
- BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, psimd_p5_x20_acc5,
- xnn_f32_rmax_ukernel__psimd,
- xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5)->Apply(CharacteristicArguments)->UseRealTime();
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
#if XNN_ARCH_WASMSIMD
BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_p5_x4,
xnn_f32_rmax_ukernel__wasmsimd_arm,
diff --git a/scripts/generate-f32-raddstoreexpminusmax.sh b/scripts/generate-f32-raddstoreexpminusmax.sh
index 44d0d12..97be07b 100755
--- a/scripts/generate-f32-raddstoreexpminusmax.sh
+++ b/scripts/generate-f32-raddstoreexpminusmax.sh
@@ -99,20 +99,6 @@
tools/xngen src/f32-raddstoreexpminusmax/avx512f-p5-scalef.c.in -D ELEMENTS_TILE=192 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/avx512f-p5-scalef-x192-acc3.c
tools/xngen src/f32-raddstoreexpminusmax/avx512f-p5-scalef.c.in -D ELEMENTS_TILE=192 -D ACCUMULATORS=6 -o src/f32-raddstoreexpminusmax/gen/avx512f-p5-scalef-x192-acc6.c
-#################################### PSIMD ####################################
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=4 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=8 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=8 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=12 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=12 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=12 -D ACCUMULATORS=3 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=16 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=16 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=16 -D ACCUMULATORS=4 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=20 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=20 -D ACCUMULATORS=2 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
-tools/xngen src/f32-raddstoreexpminusmax/psimd-p5.c.in -D ELEMENTS_TILE=20 -D ACCUMULATORS=5 -o src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
-
################################## WAsm SIMD ##################################
tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-p5.c.in -D ELEMENTS_TILE=4 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x4.c
tools/xngen src/f32-raddstoreexpminusmax/wasmsimd-p5.c.in -D ELEMENTS_TILE=8 -D ACCUMULATORS=1 -o src/f32-raddstoreexpminusmax/gen/wasmsimd-p5-x8.c
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
deleted file mode 100644
index 50edd00..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc2.c
+++ /dev/null
@@ -1,244 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
- // Load 12 (3x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- input += 12;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
-
- // Store 12 (3x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- output += 12;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- }
- // Add up all accumulators to vacc0
- vacc0 = psimd_add_f32(vacc0, vacc1);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
deleted file mode 100644
index ede5c6c..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12-acc3.c
+++ /dev/null
@@ -1,246 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- psimd_f32 vacc2 = psimd_zero_f32();
- for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
- // Load 12 (3x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- input += 12;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
-
- // Store 12 (3x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- output += 12;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc1 = psimd_add_f32(vacc1, vf4567);
- vacc2 = psimd_add_f32(vacc2, vf89AB);
- }
- // Add up all accumulators to vacc0
- vacc0 = psimd_add_f32(vacc0, vacc1);
- vacc0 = psimd_add_f32(vacc0, vacc2);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
deleted file mode 100644
index e8e2df9..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x12.c
+++ /dev/null
@@ -1,241 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- for (; elements >= 12 * sizeof(float); elements -= 12 * sizeof(float)) {
- // Load 12 (3x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- input += 12;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
-
- // Store 12 (3x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- output += 12;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- }
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
deleted file mode 100644
index 4df8338..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc2.c
+++ /dev/null
@@ -1,260 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
- // Load 16 (4x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- const psimd_f32 viCDEF = psimd_load_f32(input + 12);
- input += 16;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
- const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
- psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
- const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
- vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
- psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
- vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
- psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
- vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
- psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
- vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
-
- // Store 16 (4x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- psimd_store_f32(output + 12, vfCDEF);
- output += 16;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- vacc0 = psimd_add_f32(vacc0, vfCDEF);
- }
- // Add up all accumulators to vacc0
- vacc0 = psimd_add_f32(vacc0, vacc1);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
deleted file mode 100644
index 8ac5fb8..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16-acc4.c
+++ /dev/null
@@ -1,264 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- psimd_f32 vacc2 = psimd_zero_f32();
- psimd_f32 vacc3 = psimd_zero_f32();
- for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
- // Load 16 (4x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- const psimd_f32 viCDEF = psimd_load_f32(input + 12);
- input += 16;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
- const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
- psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
- const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
- vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
- psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
- vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
- psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
- vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
- psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
- vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
-
- // Store 16 (4x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- psimd_store_f32(output + 12, vfCDEF);
- output += 16;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- vacc0 = psimd_add_f32(vacc0, vfCDEF);
- }
- // Add up all accumulators to vacc0
- vacc0 = psimd_add_f32(vacc0, vacc1);
- vacc2 = psimd_add_f32(vacc2, vacc3);
- vacc0 = psimd_add_f32(vacc0, vacc2);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
deleted file mode 100644
index 7751927..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x16.c
+++ /dev/null
@@ -1,257 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // High part of -log(2); the last 7 bits of its mantissa are zeroes.
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- for (; elements >= 16 * sizeof(float); elements -= 16 * sizeof(float)) {
- // Load 16 (4x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- const psimd_f32 viCDEF = psimd_load_f32(input + 12);
- input += 16;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
- const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
- psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
- const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
- vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
- psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
- vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
- psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
- vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
- psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
- vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
-
- // Store 16 (4x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- psimd_store_f32(output + 12, vfCDEF);
- output += 16;
-
- // Accumulate computed exponents into the single accumulator vacc0.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- vacc0 = psimd_add_f32(vacc0, vfCDEF);
- }
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load a full vector of 4 inputs; only the first 1-3 are valid here, so this reads past the last valid input.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
deleted file mode 100644
index 4231d00..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc2.c
+++ /dev/null
@@ -1,276 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // High part of -log(2); the last 7 bits of its mantissa are zeroes.
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
- // Load 20 (5x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- const psimd_f32 viCDEF = psimd_load_f32(input + 12);
- const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
- input += 20;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
- const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
- const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
- psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
- psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
- const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
- const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
- vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
- vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
- psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
- psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
- vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
- vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
- psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
- psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
- vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
- vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
- psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
- psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
- vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
- vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
-
- // Store 20 (5x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- psimd_store_f32(output + 12, vfCDEF);
- psimd_store_f32(output + 16, vfGHIJ);
- output += 20;
-
- // Accumulate computed exponents. NOTE(review): all five vectors are added to vacc0; vacc1 is never updated here.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- vacc0 = psimd_add_f32(vacc0, vfCDEF);
- vacc0 = psimd_add_f32(vacc0, vfGHIJ);
- }
- // Add up all accumulators to vacc0 (vacc1 still holds its initial zero at this point).
- vacc0 = psimd_add_f32(vacc0, vacc1);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load a full vector of 4 inputs; only the first 1-3 are valid here, so this reads past the last valid input.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
deleted file mode 100644
index fc7327b..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20-acc5.c
+++ /dev/null
@@ -1,282 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // High part of -log(2); the last 7 bits of its mantissa are zeroes.
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- psimd_f32 vacc2 = psimd_zero_f32();
- psimd_f32 vacc3 = psimd_zero_f32();
- psimd_f32 vacc4 = psimd_zero_f32();
- for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
- // Load 20 (5x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- const psimd_f32 viCDEF = psimd_load_f32(input + 12);
- const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
- input += 20;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
- const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
- const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
- psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
- psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
- const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
- const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
- vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
- vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
- psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
- psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
- vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
- vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
- psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
- psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
- vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
- vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
- psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
- psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
- vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
- vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
-
- // Store 20 (5x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- psimd_store_f32(output + 12, vfCDEF);
- psimd_store_f32(output + 16, vfGHIJ);
- output += 20;
-
- // Accumulate computed exponents, each of the five vectors into its own accumulator (vacc0..vacc4).
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc4 = psimd_add_f32(vacc4, vf4567);
- vacc3 = psimd_add_f32(vacc3, vf89AB);
- vacc2 = psimd_add_f32(vacc2, vfCDEF);
- vacc1 = psimd_add_f32(vacc1, vfGHIJ);
- }
- // Add up all accumulators to vacc0
- vacc0 = psimd_add_f32(vacc0, vacc1);
- vacc2 = psimd_add_f32(vacc2, vacc3);
- vacc0 = psimd_add_f32(vacc0, vacc2);
- vacc0 = psimd_add_f32(vacc0, vacc4);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load a full vector of 4 inputs; only the first 1-3 are valid here, so this reads past the last valid input.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument n := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final n := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - n * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below the denormal cutoff (vdenorm_cutoff), replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
deleted file mode 100644
index 259375c..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x20.c
+++ /dev/null
@@ -1,273 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
- // Load 20 (5x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- const psimd_f32 vi89AB = psimd_load_f32(input + 8);
- const psimd_f32 viCDEF = psimd_load_f32(input + 12);
- const psimd_f32 viGHIJ = psimd_load_f32(input + 16);
- input += 20;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
- const psimd_f32 vx89AB = psimd_sub_f32(vi89AB, vi_max);
- const psimd_f32 vxCDEF = psimd_sub_f32(viCDEF, vi_max);
- const psimd_f32 vxGHIJ = psimd_sub_f32(viGHIJ, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
- psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vx89AB, vlog2e);
- psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vxCDEF, vlog2e);
- psimd_f32 vnGHIJ = psimd_qfma_f32(vmagic_bias, vxGHIJ, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
- const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
- const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
- const psimd_f32 vsGHIJ = (psimd_f32) ((psimd_u32) vnGHIJ << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
- vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
- vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
- vnGHIJ = psimd_sub_f32(vnGHIJ, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
- psimd_f32 vt89AB = psimd_qfma_f32(vx89AB, vn89AB, vminus_ln2_hi);
- psimd_f32 vtCDEF = psimd_qfma_f32(vxCDEF, vnCDEF, vminus_ln2_hi);
- psimd_f32 vtGHIJ = psimd_qfma_f32(vxGHIJ, vnGHIJ, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
- vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vminus_ln2_lo);
- vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vminus_ln2_lo);
- vtGHIJ = psimd_qfma_f32(vtGHIJ, vnGHIJ, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
- psimd_f32 vp89AB = psimd_qfma_f32(vc4, vc5, vt89AB);
- psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vc5, vtCDEF);
- psimd_f32 vpGHIJ = psimd_qfma_f32(vc4, vc5, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc3, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc3, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc3, vpGHIJ, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc2, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc2, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc2, vpGHIJ, vtGHIJ);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
- vp89AB = psimd_qfma_f32(vc1, vp89AB, vt89AB);
- vpCDEF = psimd_qfma_f32(vc1, vpCDEF, vtCDEF);
- vpGHIJ = psimd_qfma_f32(vc1, vpGHIJ, vtGHIJ);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
- vt89AB = psimd_mul_f32(vt89AB, vs89AB);
- vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
- vtGHIJ = psimd_mul_f32(vtGHIJ, vsGHIJ);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
- psimd_f32 vf89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
- psimd_f32 vfCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
- psimd_f32 vfGHIJ = psimd_qfma_f32(vsGHIJ, vtGHIJ, vpGHIJ);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
- vf89AB = psimd_andnotmask_f32(vx89AB < vdenorm_cutoff, vf89AB);
- vfCDEF = psimd_andnotmask_f32(vxCDEF < vdenorm_cutoff, vfCDEF);
- vfGHIJ = psimd_andnotmask_f32(vxGHIJ < vdenorm_cutoff, vfGHIJ);
-
- // Store 20 (5x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- psimd_store_f32(output + 8, vf89AB);
- psimd_store_f32(output + 12, vfCDEF);
- psimd_store_f32(output + 16, vfGHIJ);
- output += 20;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- vacc0 = psimd_add_f32(vacc0, vf89AB);
- vacc0 = psimd_add_f32(vacc0, vfCDEF);
- vacc0 = psimd_add_f32(vacc0, vfGHIJ);
- }
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
deleted file mode 100644
index f6e5c1c..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x4.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 (1x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
-
- // Store 4 (1x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- output += 4;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- }
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
deleted file mode 100644
index c7958fa..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8-acc2.c
+++ /dev/null
@@ -1,228 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- psimd_f32 vacc1 = psimd_zero_f32();
- for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
- // Load 8 (2x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- input += 8;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
-
- // Store 8 (2x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- output += 8;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- }
- // Add up all accumulators to vacc0
- vacc0 = psimd_add_f32(vacc0, vacc1);
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c b/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
deleted file mode 100644
index e803c5e..0000000
--- a/src/f32-raddstoreexpminusmax/gen/psimd-p5-x8.c
+++ /dev/null
@@ -1,225 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-raddstoreexpminusmax/psimd-p5.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- psimd_f32 vacc0 = psimd_zero_f32();
- for (; elements >= 8 * sizeof(float); elements -= 8 * sizeof(float)) {
- // Load 8 (2x4) inputs at a time.
- const psimd_f32 vi0123 = psimd_load_f32(input);
- const psimd_f32 vi4567 = psimd_load_f32(input + 4);
- input += 8;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx0123 = psimd_sub_f32(vi0123, vi_max);
- const psimd_f32 vx4567 = psimd_sub_f32(vi4567, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vx0123, vlog2e);
- psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vx4567, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
- const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
- vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt0123 = psimd_qfma_f32(vx0123, vn0123, vminus_ln2_hi);
- psimd_f32 vt4567 = psimd_qfma_f32(vx4567, vn4567, vminus_ln2_hi);
-
- vt0123 = psimd_qfma_f32(vt0123, vn0123, vminus_ln2_lo);
- vt4567 = psimd_qfma_f32(vt4567, vn4567, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp0123 = psimd_qfma_f32(vc4, vc5, vt0123);
- psimd_f32 vp4567 = psimd_qfma_f32(vc4, vc5, vt4567);
-
- vp0123 = psimd_qfma_f32(vc3, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc3, vp4567, vt4567);
-
- vp0123 = psimd_qfma_f32(vc2, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc2, vp4567, vt4567);
-
- vp0123 = psimd_qfma_f32(vc1, vp0123, vt0123);
- vp4567 = psimd_qfma_f32(vc1, vp4567, vt4567);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt0123 = psimd_mul_f32(vt0123, vs0123);
- vt4567 = psimd_mul_f32(vt4567, vs4567);
-
- psimd_f32 vf0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
- psimd_f32 vf4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf0123 = psimd_andnotmask_f32(vx0123 < vdenorm_cutoff, vf0123);
- vf4567 = psimd_andnotmask_f32(vx4567 < vdenorm_cutoff, vf4567);
-
- // Store 8 (2x4) outputs at a time.
- psimd_store_f32(output, vf0123);
- psimd_store_f32(output + 4, vf4567);
- output += 8;
-
- // Accumulate computed exponents.
- vacc0 = psimd_add_f32(vacc0, vf0123);
- vacc0 = psimd_add_f32(vacc0, vf4567);
- }
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/src/f32-raddstoreexpminusmax/psimd-p5.c.in b/src/f32-raddstoreexpminusmax/psimd-p5.c.in
deleted file mode 100644
index d7fade0..0000000
--- a/src/f32-raddstoreexpminusmax/psimd-p5.c.in
+++ /dev/null
@@ -1,236 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert ELEMENTS_TILE % 4 == 0
-$assert ELEMENTS_TILE >= 4
-$SIMD_TILE = ELEMENTS_TILE // 4
-$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/raddstoreexpminusmax.h>
-
-
-void xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x${ELEMENTS_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
- size_t elements,
- const float* input,
- float* output,
- float* sum,
- float max) XNN_DISABLE_TSAN
-{
- assert(elements % sizeof(float) == 0);
-
- const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
- // The smallest x for which expf(x) is normalized.
- const psimd_f32 vdenorm_cutoff = psimd_splat_f32(-0x1.5D589Ep6f);
- const psimd_f32 vlog2e = psimd_splat_f32(0x1.715476p+0f);
- // Last 7 bits are zeroes
- const psimd_f32 vminus_ln2_hi = psimd_splat_f32(-0x1.62E400p-1f);
- const psimd_f32 vminus_ln2_lo = psimd_splat_f32(-0x1.7F7D1Cp-20f);
-
- const psimd_f32 vc1 = psimd_splat_f32(0x1.FFFFF6p-1f);
- const psimd_f32 vc2 = psimd_splat_f32(0x1.FFFDC6p-2f);
- const psimd_f32 vc3 = psimd_splat_f32(0x1.555A80p-3f);
- const psimd_f32 vc4 = psimd_splat_f32(0x1.573A1Ap-5f);
- const psimd_f32 vc5 = psimd_splat_f32(0x1.0F9F9Cp-7f);
-
- const psimd_f32 vi_max = psimd_splat_f32(max);
-
- $for K in range(ACCUMULATORS):
- psimd_f32 vacc${K} = psimd_zero_f32();
- for (; elements >= ${ELEMENTS_TILE} * sizeof(float); elements -= ${ELEMENTS_TILE} * sizeof(float)) {
- // Load ${ELEMENTS_TILE} (${SIMD_TILE}x4) inputs at a time.
- const psimd_f32 vi${ABC[0:4]} = psimd_load_f32(input);
- $for N in range(4, ELEMENTS_TILE, 4):
- const psimd_f32 vi${ABC[N:N+4]} = psimd_load_f32(input + ${N});
- input += ${ELEMENTS_TILE};
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- $for N in range(0, ELEMENTS_TILE, 4):
- const psimd_f32 vx${ABC[N:N+4]} = psimd_sub_f32(vi${ABC[N:N+4]}, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- $for N in range(0, ELEMENTS_TILE, 4):
- psimd_f32 vn${ABC[N:N+4]} = psimd_qfma_f32(vmagic_bias, vx${ABC[N:N+4]}, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- $for N in range(0, ELEMENTS_TILE, 4):
- const psimd_f32 vs${ABC[N:N+4]} = (psimd_f32) ((psimd_u32) vn${ABC[N:N+4]} << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- $for N in range(0, ELEMENTS_TILE, 4):
- vn${ABC[N:N+4]} = psimd_sub_f32(vn${ABC[N:N+4]}, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- $for N in range(0, ELEMENTS_TILE, 4):
- psimd_f32 vt${ABC[N:N+4]} = psimd_qfma_f32(vx${ABC[N:N+4]}, vn${ABC[N:N+4]}, vminus_ln2_hi);
-
- $for N in range(0, ELEMENTS_TILE, 4):
- vt${ABC[N:N+4]} = psimd_qfma_f32(vt${ABC[N:N+4]}, vn${ABC[N:N+4]}, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- $for N in range(0, ELEMENTS_TILE, 4):
- psimd_f32 vp${ABC[N:N+4]} = psimd_qfma_f32(vc4, vc5, vt${ABC[N:N+4]});
-
- $for N in range(0, ELEMENTS_TILE, 4):
- vp${ABC[N:N+4]} = psimd_qfma_f32(vc3, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
- $for N in range(0, ELEMENTS_TILE, 4):
- vp${ABC[N:N+4]} = psimd_qfma_f32(vc2, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
- $for N in range(0, ELEMENTS_TILE, 4):
- vp${ABC[N:N+4]} = psimd_qfma_f32(vc1, vp${ABC[N:N+4]}, vt${ABC[N:N+4]});
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- $for N in range(0, ELEMENTS_TILE, 4):
- vt${ABC[N:N+4]} = psimd_mul_f32(vt${ABC[N:N+4]}, vs${ABC[N:N+4]});
-
- $for N in range(0, ELEMENTS_TILE, 4):
- psimd_f32 vf${ABC[N:N+4]} = psimd_qfma_f32(vs${ABC[N:N+4]}, vt${ABC[N:N+4]}, vp${ABC[N:N+4]});
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- $for N in range(0, ELEMENTS_TILE, 4):
- vf${ABC[N:N+4]} = psimd_andnotmask_f32(vx${ABC[N:N+4]} < vdenorm_cutoff, vf${ABC[N:N+4]});
-
- // Store ${ELEMENTS_TILE} (${SIMD_TILE}x4) outputs at a time.
- psimd_store_f32(output, vf${ABC[0:4]});
- $for N in range(4, ELEMENTS_TILE, 4):
- psimd_store_f32(output + ${N}, vf${ABC[N:N+4]});
- output += ${ELEMENTS_TILE};
-
- // Accumulate computed exponents.
- $for N in range(0, ELEMENTS_TILE, 4):
- vacc${N % ACCUMULATORS} = psimd_add_f32(vacc${N % ACCUMULATORS}, vf${ABC[N:N+4]});
- }
- $if ACCUMULATORS > 1:
- // Add up all accumulators to vacc0
- $ACC_SLICE = 1
- $while ACC_SLICE < ACCUMULATORS:
- $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
- $if A + ACC_SLICE < ACCUMULATORS:
- vacc${A} = psimd_add_f32(vacc${A}, vacc${A + ACC_SLICE});
- $ACC_SLICE *= 2
-
- psimd_f32 vacc = vacc0;
- for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
- input += 4;
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- // Store 4 outputs at a time.
- psimd_store_f32(output, vf);
- output += 4;
-
- // Accumulate computed exponents.
- vacc = psimd_add_f32(vacc, vf);
- }
- if (elements != 0) {
- assert(elements >= 1 * sizeof(float));
- assert(elements <= 3 * sizeof(float));
- // Load 4 inputs at a time.
- const psimd_f32 vi = psimd_load_f32(input);
-
- // Subtract maximum input x := i - i_max. This implies x <= 0.
- const psimd_f32 vx = psimd_sub_f32(vi, vi_max);
-
- // Compute reduced argument elements := round(x / log(2)).
- psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vx, vlog2e);
-
- // Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
- // -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
- const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
-
- // Subtract the large number back to get final elements := round(x / log(2)).
- vn = psimd_sub_f32(vn, vmagic_bias);
-
- // Compute reduced argument t := x - elements * log(2).
- // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
- psimd_f32 vt = psimd_qfma_f32(vx, vn, vminus_ln2_hi);
- vt = psimd_qfma_f32(vt, vn, vminus_ln2_lo);
-
- // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
- psimd_f32 vp = psimd_qfma_f32(vc4, vc5, vt);
- vp = psimd_qfma_f32(vc3, vp, vt);
- vp = psimd_qfma_f32(vc2, vp, vt);
- vp = psimd_qfma_f32(vc1, vp, vt);
-
- // Reconstruct the final f value:
- // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
- // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
- // = s + (t * s) * p
- vt = psimd_mul_f32(vt, vs);
- psimd_f32 vf = psimd_qfma_f32(vs, vt, vp);
-
- // For inputs below zero cutoff, replace output with +0.0f.
- // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
- vf = psimd_andnotmask_f32(vx < vdenorm_cutoff, vf);
-
- if (elements & (2 * sizeof(float))) {
- // Store 2 outputs at a time.
- psimd_store2_f32(output, vf);
- output += 2;
-
- // Accumulate 2 computed exponents.
- vacc = psimd_add_f32(vacc, psimd_concat_lo_f32(vf, psimd_zero_f32()));
-
- vf = psimd_concat_hi_f32(vf, vf);
- }
- if (elements & (1 * sizeof(float))) {
- // Store 1 output at a time.
- psimd_store1_f32(output, vf);
-
- // Accumulate 1 computed exponent.
- const psimd_f32 vzero = psimd_zero_f32();
- vf = psimd_concat_lo_f32(vf, vzero);
- vf = psimd_concat_even_f32(vf, vzero);
- vacc = psimd_add_f32(vacc, vf);
- }
- }
- // Reduce 4 elements in the SIMD register
- *sum = psimd_reduce_sum_f32(vacc);
-}
diff --git a/test/f32-raddstoreexpminusmax.cc b/test/f32-raddstoreexpminusmax.cc
index 4e7cf7f..0c0788c 100644
--- a/test/f32-raddstoreexpminusmax.cc
+++ b/test/f32-raddstoreexpminusmax.cc
@@ -3125,450 +3125,6 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X4, elements_eq_4) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(4)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X4, elements_div_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 8; elements < 40; elements += 4) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X4, elements_lt_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 4; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X4, elements_gt_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 5; elements < 8; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8, elements_eq_8) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(8)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8, elements_div_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 16; elements < 80; elements += 8) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8, elements_lt_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 8; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8, elements_gt_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 9; elements < 16; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8_ACC2, elements_eq_8) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(8)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8_ACC2, elements_div_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 16; elements < 80; elements += 8) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8_ACC2, elements_lt_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 8; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X8_ACC2, elements_gt_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 9; elements < 16; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12, elements_eq_12) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(12)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12, elements_div_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 24; elements < 120; elements += 12) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12, elements_lt_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 12; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12, elements_gt_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 13; elements < 24; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC2, elements_eq_12) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(12)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC2, elements_div_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 24; elements < 120; elements += 12) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC2, elements_lt_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 12; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC2, elements_gt_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 13; elements < 24; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC3, elements_eq_12) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(12)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC3, elements_div_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 24; elements < 120; elements += 12) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC3, elements_lt_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 12; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X12_ACC3, elements_gt_12) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 13; elements < 24; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16, elements_eq_16) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(16)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16, elements_div_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 32; elements < 160; elements += 16) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16, elements_lt_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 16; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16, elements_gt_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 17; elements < 32; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC2, elements_eq_16) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(16)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC2, elements_div_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 32; elements < 160; elements += 16) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC2, elements_lt_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 16; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC2, elements_gt_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 17; elements < 32; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC4, elements_eq_16) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(16)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC4, elements_div_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 32; elements < 160; elements += 16) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC4, elements_lt_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 16; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X16_ACC4, elements_gt_16) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 17; elements < 32; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20, elements_eq_20) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(20)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20, elements_div_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 40; elements < 200; elements += 20) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20, elements_lt_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 20; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20, elements_gt_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 21; elements < 40; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC2, elements_eq_20) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(20)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC2, elements_div_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 40; elements < 200; elements += 20) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC2, elements_lt_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 20; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC2, elements_gt_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 21; elements < 40; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC5, elements_eq_20) {
- TEST_REQUIRES_PSIMD;
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(20)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5);
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC5, elements_div_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 40; elements < 200; elements += 20) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC5, elements_lt_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 1; elements < 20; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5);
- }
- }
-
- TEST(F32_RADDSTOREEXPMINUSMAX__PSIMD_P5_X20_ACC5, elements_gt_20) {
- TEST_REQUIRES_PSIMD;
- for (size_t elements = 21; elements < 40; elements++) {
- RAddStoreExpMinusMaxMicrokernelTester()
- .elements(elements)
- .Test(xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
#if XNN_ARCH_WASMSIMD
TEST(F32_RADDSTOREEXPMINUSMAX__WASMSIMD_P5_X4, elements_eq_4) {
RAddStoreExpMinusMaxMicrokernelTester()
diff --git a/test/f32-raddstoreexpminusmax.yaml b/test/f32-raddstoreexpminusmax.yaml
index 2c37030..358356a 100644
--- a/test/f32-raddstoreexpminusmax.yaml
+++ b/test/f32-raddstoreexpminusmax.yaml
@@ -86,18 +86,6 @@
- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc2
- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc3
- name: xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x192_acc6
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x4
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x8_acc2
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc2
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x12_acc3
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc4
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc2
-- name: xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x20_acc5
- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x4
- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x8
- name: xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x8_acc2