Refactor and open-source vectorized expminus function
- AVX2 version with degree-5 polynomial approximation
PiperOrigin-RevId: 275378607
diff --git a/BUILD b/BUILD
index 80c0d9e..49d8dba 100644
--- a/BUILD
+++ b/BUILD
@@ -410,6 +410,7 @@
"src/math/exp-avx2-p5.c",
"src/math/exp-avx2-perm-p3.c",
"src/math/exp-avx2-perm-p4.c",
+ "src/math/expminus-avx2-p5.c",
]
AVX512F_UKERNELS = [
@@ -1044,6 +1045,15 @@
deps = ACCURACY_EVAL_DEPS,
)
+xnnpack_benchmark(
+ name = "f32_expminus_eval",
+ srcs = [
+ "eval/f32-expminus.cc",
+ "src/xnnpack/AlignedAllocator.h",
+ ] + ACCURACY_EVAL_HDRS,
+ deps = ACCURACY_EVAL_DEPS,
+)
+
######################### Unit tests for micro-kernels #########################
xnnpack_unit_test(
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ea59b6..480380a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -488,7 +488,8 @@
SET(XNNPACK_AVX2_MICROKERNEL_SRCS
src/math/exp-avx2-p5.c
src/math/exp-avx2-perm-p3.c
- src/math/exp-avx2-perm-p4.c)
+ src/math/exp-avx2-perm-p4.c
+ src/math/expminus-avx2-p5.c)
SET(XNNPACK_AVX512F_MICROKERNEL_SRCS
src/f32-rmax/avx512f.c
diff --git a/eval/f32-expminus.cc b/eval/f32-expminus.cc
new file mode 100644
index 0000000..f6e83aa
--- /dev/null
+++ b/eval/f32-expminus.cc
@@ -0,0 +1,64 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cfloat>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include <fp16/fp16.h>
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+static void ExpError(benchmark::State& state,
+ xnn_f32_unary_math_function exp,
+ size_t tile_size)
+{
+ // The smallest x for which expf(x) is normalized (-0x1.5D589Ep6f).
+ const uint32_t min_input = 0xC2AEAC4FL;
+ const size_t num_tiles = 100;
+
+ double max_ulp_error = 0.0;
+ std::vector<float, AlignedAllocator<float, 64>> x(tile_size * num_tiles);
+ std::vector<float, AlignedAllocator<float, 64>> y(tile_size * num_tiles);
+ for (auto _ : state) {
+ for (uint32_t n = min_input; int32_t(n) < 0; n -= tile_size * num_tiles) {
+ for (uint32_t i = 0; i < tile_size * num_tiles; i++) {
+ x[i] = fp32_from_bits(std::max<uint32_t>(n - i, 0x80000000));
+ }
+ std::fill(y.begin(), y.end(), std::nanf(""));
+
+ exp(tile_size * num_tiles * sizeof(float), x.data(), y.data());
+
+ for (uint32_t i = 0; i < tile_size * num_tiles; i++) {
+ const double y_ref = std::exp(double(x[i]));
+ const double abs_error = std::abs(y_ref - double(y[i]));
+ const float y_abs = std::abs(y_ref);
+ const float y_ulp = fp32_from_bits(fp32_to_bits(y_abs) + 1) - y_abs;
+ max_ulp_error = std::max<double>(max_ulp_error, abs_error / y_ulp);
+ }
+ }
+ }
+
+ state.counters["ULPERROR"] = benchmark::Counter(max_ulp_error);
+}
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ static void f32_expminus__avx2_p5(benchmark::State& state) {
+ ExpError(state, xnn_math_f32_expminus__avx2_p5, 8);
+ }
+
+ BENCHMARK(f32_expminus__avx2_p5)->Unit(benchmark::kMillisecond)->Iterations(1);
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
diff --git a/src/math/expminus-avx2-p5.c b/src/math/expminus-avx2-p5.c
new file mode 100644
index 0000000..8c05304
--- /dev/null
+++ b/src/math/expminus-avx2-p5.c
@@ -0,0 +1,78 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_expminus__avx2_p5(
+ size_t n,
+ const float* input,
+ float* output)
+{
+ assert(n % (8 * sizeof(float)) == 0);
+
+ const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+ // The smallest x for which expf(x) is normalized.
+ const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+ const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+ const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E43p-1f);
+ const __m256 vminus_ln2_lo = _mm256_set1_ps(0x1.05C61p-29f);
+
+ const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+ const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+ const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+ const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+ const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+
+ for (; n != 0; n -= 8 * sizeof(float)) {
+ const __m256 vx = _mm256_loadu_ps(input);
+
+ // Compute reduced argument n := round(x / log(2)).
+ // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
+ // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
+ // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
+ // inputs outside of [-103.97207, 0.0] underflow expf(x) anyway. We fixup the result for such inputs at the very end
+ // of the algorithm.
+ __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
+
+ // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+ // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+ const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
+
+ // Subtract the large number back to get final n := round(x / log(2)).
+ vn = _mm256_sub_ps(vn, vmagic_bias);
+
+ // Compute reduced argument t := x - n * log(2).
+ // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+ __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2_hi, vx);
+ vt = _mm256_fmadd_ps(vn, vminus_ln2_lo, vt);
+
+ // Compute degree-5 polynomial approxiatmion for exp(t) on [-log(2)/2, log(2)/2].
+ __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
+ vp = _mm256_fmadd_ps(vp, vt, vc3);
+ vp = _mm256_fmadd_ps(vp, vt, vc2);
+ vp = _mm256_fmadd_ps(vp, vt, vc1);
+
+ // Reconstruct the final f value:
+ // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+ // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+ // = s + (t * s) * p
+ vt = _mm256_mul_ps(vt, vs);
+ __m256 vf = _mm256_fmadd_ps(vt, vp, vs);
+
+ // For inputs below denormal cutoff, replace output with +0.0f.
+ // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+ vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);
+ _mm256_storeu_ps(output, vf);
+
+ input += 8;
+ output += 8;
+ }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index 22f9fad..4154a0e 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -30,6 +30,8 @@
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_p5_scalef)
DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_perm_p3)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_expminus__avx2_p5)
+
#ifdef __cplusplus
} /* extern "C" */
#endif