Additional Sigmoid micro-kernels and accuracy evaluation stubs

- PSIMD micro-kernels and accuracy evaluation stubs
- ARM NEON micro-kernels using 2048-entry table lookups
- ARM NEON micro-kernels with alternative division implementations
- ARM NEON micro-kernels without FMA
- x4..x24 versions of all SIMD micro-kernels
- Eliminated the comparison with one_cutoff and the corresponding blend in all micro-kernels

PiperOrigin-RevId: 287804583

commit: 8d3c07e03a55862847e0a6a90f6f9177e87dba4a [log] [tgz]
author: Marat Dukhan <maratek@google.com> Thu Jan 02 01:20:59 2020 -0800
committer: XNNPACK Team <xnnpack-github-robot@google.com> Thu Jan 02 01:21:29 2020 -0800
tree: e328046667bbb59bdd88ce320abcf29f8857cc9a
parent: 279908a1af406a1973069979906d9fae569719fa [diff] [blame]
diff --git a/src/math/sigmoid-neonfma-p5-nr2recps.c b/src/math/sigmoid-neonfma-p5-nr2recps.c
index fdb219f..578da6f 100644
--- a/src/math/sigmoid-neonfma-p5-nr2recps.c
+++ b/src/math/sigmoid-neonfma-p5-nr2recps.c

@@ -19,11 +19,9 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -94,18 +92,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(ve, vr);
 
+    // For inputs whose magnitude exceeds the denormal cutoff (absolute-value compare), replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }
commit	8d3c07e03a55862847e0a6a90f6f9177e87dba4a	[log] [tgz]
author	Marat Dukhan <maratek@google.com>	Thu Jan 02 01:20:59 2020 -0800
committer	XNNPACK Team <xnnpack-github-robot@google.com>	Thu Jan 02 01:21:29 2020 -0800
tree	e328046667bbb59bdd88ce320abcf29f8857cc9a
parent	279908a1af406a1973069979906d9fae569719fa [diff] [blame]