Additional Sigmoid micro-kernels and accuracy evaluation stub

- PSIMD micro-kernels and accuracy evaluation stubs
- ARM NEON micro-kernels using 2048-entry table lookups
- ARM NEON micro-kernels with alternative division implementations
- ARM NEON micro-kernels without FMA
- x4..x24 versions of all SIMD micro-kernels
- Eliminated the comparison with one_cutoff and the corresponding blend in
  all micro-kernels

PiperOrigin-RevId: 287804583
diff --git a/src/math/sigmoid-neonfma-p5-nr2recps.c b/src/math/sigmoid-neonfma-p5-nr2recps.c
index fdb219f..578da6f 100644
--- a/src/math/sigmoid-neonfma-p5-nr2recps.c
+++ b/src/math/sigmoid-neonfma-p5-nr2recps.c
@@ -19,11 +19,9 @@
   assert(n % (4 * sizeof(float)) == 0);
 
   const float32x4_t vmagic_bias = vmovq_n_f32(0x1.8000FEp23f);
-  // The smallest x for which sigmoidf(x) is normalized.
-  // This number is also the smallest x for which expf(x) is normalized.
+  // The largest z for which sigmoidf(-z) is normalized.
+  // This number is also the largest z for which expf(-z) is normalized.
   const float32x4_t vdenorm_cutoff = vmovq_n_f32(-0x1.5D589Ep+6f);
-  // The largest x for which sigmoidf(x) is not equal 1.0.
-  const float32x4_t vone_cutoff = vmovq_n_f32(0x1.154244p+4f);
   const float32x4_t vminus_log2e = vmovq_n_f32(-0x1.715476p+0f);
   const float32x4_t vln2_hi = vmovq_n_f32(0x1.62E43p-1f);
   const float32x4_t vln2_lo = vmovq_n_f32(-0x1.05C61p-29f);
@@ -94,18 +92,14 @@
     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
     float32x4_t vf = vmulq_f32(ve, vr);
 
+    // For inputs whose magnitude exceeds the denormal cutoff, replace sigmoid(-z) with +0.0f.
+    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
+    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcagtq_f32(vx, vdenorm_cutoff)));
+
     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
     const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
     vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));
 
-    // For inputs above 1.0 cutoff, replace output with 1.0.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vbslq_f32(vcgtq_f32(vx, vone_cutoff), vone, vf);
-
-    // For inputs below denormal cutoff, replace output with +0.0f.
-    // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
-    vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vcltq_f32(vx, vdenorm_cutoff)));
-
     vst1q_f32(output, vf); output += 4;
   }
 }