Refactor and open-source vectorized expminus function

- AVX2 version with degree-5 polynomial approximation
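
For reference, a minimal scalar sketch of the same algorithm (illustrative
only: the helper name is made up, and without FMA the scalar results will
not match the vector kernel bit-for-bit):

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  static float expminus_p5_scalar(float x) {
    // n := round(x / log(2)), via the magic-bias trick.
    float n = fmaf(x, 0x1.715476p+0f, 0x1.8000FEp23f);
    // s := 2**n, built by shifting the low bits of n into the exponent field.
    uint32_t nbits;
    memcpy(&nbits, &n, sizeof(nbits));
    const uint32_t sbits = nbits << 23;
    float s;
    memcpy(&s, &sbits, sizeof(s));
    n -= 0x1.8000FEp23f;
    // t := x - n * log(2), with log(2) split into two constants (Cody-Waite).
    float t = fmaf(n, -0x1.62E43p-1f, x);
    t = fmaf(n, 0x1.05C61p-29f, t);
    // p(t) = c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))), in Horner form.
    float p = fmaf(0x1.0F9F9Cp-7f, t, 0x1.573A1Ap-5f);
    p = fmaf(p, t, 0x1.555A80p-3f);
    p = fmaf(p, t, 0x1.FFFDC6p-2f);
    p = fmaf(p, t, 0x1.FFFFF6p-1f);
    // exp(x) ~= s * (1 + t * p) == s + (t * s) * p.
    t *= s;
    const float f = fmaf(t, p, s);
    // Flush results for inputs below the denormal cutoff to +0.0f.
    return x < -0x1.5D589Ep6f ? 0.0f : f;
  }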

PiperOrigin-RevId: 275378607
diff --git a/BUILD b/BUILD
index 80c0d9e..49d8dba 100644
--- a/BUILD
+++ b/BUILD
@@ -410,6 +410,7 @@
     "src/math/exp-avx2-p5.c",
     "src/math/exp-avx2-perm-p3.c",
     "src/math/exp-avx2-perm-p4.c",
+    "src/math/expminus-avx2-p5.c",
 ]
 
 AVX512F_UKERNELS = [
@@ -1044,6 +1045,15 @@
     deps = ACCURACY_EVAL_DEPS,
 )
 
+xnnpack_benchmark(
+    name = "f32_expminus_eval",
+    srcs = [
+        "eval/f32-expminus.cc",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + ACCURACY_EVAL_HDRS,
+    deps = ACCURACY_EVAL_DEPS,
+)
+
 ######################### Unit tests for micro-kernels #########################
 
 xnnpack_unit_test(
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ea59b6..480380a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -488,7 +488,8 @@
 SET(XNNPACK_AVX2_MICROKERNEL_SRCS
   src/math/exp-avx2-p5.c
   src/math/exp-avx2-perm-p3.c
-  src/math/exp-avx2-perm-p4.c)
+  src/math/exp-avx2-perm-p4.c
+  src/math/expminus-avx2-p5.c)
 
 SET(XNNPACK_AVX512F_MICROKERNEL_SRCS
   src/f32-rmax/avx512f.c
diff --git a/eval/f32-expminus.cc b/eval/f32-expminus.cc
new file mode 100644
index 0000000..f6e83aa
--- /dev/null
+++ b/eval/f32-expminus.cc
@@ -0,0 +1,64 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cfloat>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include <fp16/fp16.h>
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+static void ExpError(benchmark::State& state,
+  xnn_f32_unary_math_function exp,
+  size_t tile_size)
+{
+  // The smallest x for which expf(x) is normalized (-0x1.5D589Ep6f).
+  const uint32_t min_input = 0xC2AEAC4FL;
+  const size_t num_tiles = 100;
+
+  double max_ulp_error = 0.0;
+  std::vector<float, AlignedAllocator<float, 64>> x(tile_size * num_tiles);
+  std::vector<float, AlignedAllocator<float, 64>> y(tile_size * num_tiles);
+  for (auto _ : state) {
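+    // Scan all negative inputs from min_input toward -0.0f by decrementing the
+    // bit pattern; int32_t(n) < 0 holds exactly while n still encodes a negative float.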
+    for (uint32_t n = min_input; int32_t(n) < 0; n -= tile_size * num_tiles) {
+      for (uint32_t i = 0; i < tile_size * num_tiles; i++) {
+        x[i] = fp32_from_bits(std::max<uint32_t>(n - i, 0x80000000));
+      }
+      std::fill(y.begin(), y.end(), std::nanf(""));
+
+      exp(tile_size * num_tiles * sizeof(float), x.data(), y.data());
+
+      for (uint32_t i = 0; i < tile_size * num_tiles; i++) {
+        const double y_ref = std::exp(double(x[i]));
+        const double abs_error = std::abs(y_ref - double(y[i]));
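+        // One ULP at the reference value: the gap from |y_ref|, rounded to float,
+        // to the next representable float.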
+        const float y_abs = std::abs(y_ref);
+        const float y_ulp = fp32_from_bits(fp32_to_bits(y_abs) + 1) - y_abs;
+        max_ulp_error = std::max<double>(max_ulp_error, abs_error / y_ulp);
+      }
+    }
+  }
+
+  state.counters["ULPERROR"] = benchmark::Counter(max_ulp_error);
+}
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  static void f32_expminus__avx2_p5(benchmark::State& state) {
+    ExpError(state, xnn_math_f32_expminus__avx2_p5, 8);
+  }
+
+  BENCHMARK(f32_expminus__avx2_p5)->Unit(benchmark::kMillisecond)->Iterations(1);
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
diff --git a/src/math/expminus-avx2-p5.c b/src/math/expminus-avx2-p5.c
new file mode 100644
index 0000000..8c05304
--- /dev/null
+++ b/src/math/expminus-avx2-p5.c
@@ -0,0 +1,78 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_expminus__avx2_p5(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (8 * sizeof(float)) == 0);
+
+  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8000FEp23f);
+  // The smallest x for which expf(x) is normalized.
+  const __m256 vdenorm_cutoff = _mm256_set1_ps(-0x1.5D589Ep6f);
+  const __m256 vlog2e = _mm256_set1_ps(0x1.715476p+0f);
+  const __m256 vminus_ln2_hi = _mm256_set1_ps(-0x1.62E43p-1f);
+  const __m256 vminus_ln2_lo = _mm256_set1_ps(0x1.05C61p-29f);
+
+  const __m256 vc1 = _mm256_set1_ps(0x1.FFFFF6p-1f);
+  const __m256 vc2 = _mm256_set1_ps(0x1.FFFDC6p-2f);
+  const __m256 vc3 = _mm256_set1_ps(0x1.555A80p-3f);
+  const __m256 vc4 = _mm256_set1_ps(0x1.573A1Ap-5f);
+  const __m256 vc5 = _mm256_set1_ps(0x1.0F9F9Cp-7f);
+
+  for (; n != 0; n -= 8 * sizeof(float)) {
+    const __m256 vx = _mm256_loadu_ps(input);
+
+    // Compute reduced argument n := round(x / log(2)).
+    // We do it by adding a large number (magic bias), which causes rounding of the result to an integer, then
+    // subtracting the large number back. The first addition is combined with multiplication by log2e into a single
+    // FMA instruction. The trick with adding a large number is valid only within certain bounds (|x| <= 2**22), but
+    // that is fine, because inputs outside of [-103.97207, 0.0] underflow expf(x) anyway. We fix up the result for
+    // such inputs at the very end of the algorithm.
+    __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
+
+    // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
+    // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
+    const __m256 vs = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(vn), 23));
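+    // Illustrative example: for x = -1.0f, n rounds to -1, the low 23 bits of vn
+    // hold 0x40007E, and shifting left by 23 leaves 126 in the exponent field:
+    // bits 0x3F000000 == 0.5f == 2**-1.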
+
+    // Subtract the large number back to get final n := round(x / log(2)).
+    vn = _mm256_sub_ps(vn, vmagic_bias);
+
+    // Compute reduced argument t := x - n * log(2).
+    // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
+    __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2_hi, vx);
+    vt = _mm256_fmadd_ps(vn, vminus_ln2_lo, vt);
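+    // The split is chosen so that vn * vminus_ln2_hi incurs little or no rounding
+    // error; the low constant then restores the remaining tail of log(2).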
+
+    // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
+    __m256 vp = _mm256_fmadd_ps(vc5, vt, vc4);
+    vp = _mm256_fmadd_ps(vp, vt, vc3);
+    vp = _mm256_fmadd_ps(vp, vt, vc2);
+    vp = _mm256_fmadd_ps(vp, vt, vc1);
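+    // Equivalently p = c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))); the constant
+    // term (c0 == 1) of the degree-5 polynomial is folded into the reconstruction below.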
+
+    // Reconstruct the final f value:
+    //   f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
+    //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
+    //     = s + (t * s) * p
+    vt = _mm256_mul_ps(vt, vs);
+    __m256 vf = _mm256_fmadd_ps(vt, vp, vs);
+
+    // For inputs below the denormal cutoff, replace the output with +0.0f.
+    // Note that for NaN inputs the comparison result is false, and the outputs are left unchanged.
+    vf = _mm256_andnot_ps(_mm256_cmp_ps(vx, vdenorm_cutoff, _CMP_LT_OS), vf);
+    _mm256_storeu_ps(output, vf);
+
+    input += 8;
+    output += 8;
+  }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index 22f9fad..4154a0e 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -30,6 +30,8 @@
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_p5_scalef)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__avx512f_perm_p3)
 
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_expminus__avx2_p5)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif