Accuracy evaluation stubs for LUT-based Sigmoid implementations
PiperOrigin-RevId: 282815193
diff --git a/src/math/sigmoid-neonfma-p5-nr2fma.c b/src/math/sigmoid-neonfma-p5-nr2fma.c
index 809a292..69d235f 100644
--- a/src/math/sigmoid-neonfma-p5-nr2fma.c
+++ b/src/math/sigmoid-neonfma-p5-nr2fma.c
@@ -43,8 +43,8 @@
// f[x] :=
// \ 1 - f[-x] if x >= 0
//
- // First we compute f[z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
- // then replace result with 1 - f[z] if x >= 0.
+ // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
+ // then replace result with 1 - f[-z] if x >= 0.
const float32x4_t vz = vabsq_f32(vx);
// Compute reduced argument n := round(-z / log(2)).
@@ -59,41 +59,42 @@
// -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
const float32x4_t vs = vreinterpretq_f32_s32(vshlq_n_s32(vreinterpretq_s32_f32(vn), 23));
- // Subtract the large number back to get final n := round(-z / log(2)).
+ // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
vn = vsubq_f32(vn, vmagic_bias);
- // Compute reduced argument -t := -z - n * log(2) = -(z + n * log(2)).
+ // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
// Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
float32x4_t vt = vfmaq_f32(vz, vn, vln2_hi);
vt = vfmaq_f32(vt, vn, vln2_lo);
- // Compute degree-5 polynomial approxiatmion for exp(-t) on [-log(2)/2, log(2)/2].
+ // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
+ // P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
float32x4_t vp = vfmaq_f32(vc4, vc5, vt);
vp = vfmaq_f32(vc3, vp, vt);
vp = vfmaq_f32(vc2, vp, vt);
vp = vfmaq_f32(vc1, vp, vt);
- // Reconstruct the exp(z) value:
+ // Reconstruct the exp(-z) value:
// e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
// = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
// = s + (t * s) * p
vt = vmulq_f32(vt, vs);
float32x4_t ve = vfmaq_f32(vs, vp, vt);
- // Denominator of the sigmoid fraction: 1.0 + exp(z)
+ // Denominator of the sigmoid fraction: 1.0 + exp(-z)
float32x4_t vd = vaddq_f32(ve, vone);
// Use Newton-Raphson method (2 iterations) to compute reciprocal of denominator.
- // Note: 1 < d <= 2, because z <= 0.0 and 0 < exp(z) <= 1.0.
+ // Note: 1 < d <= 2, because z >= 0.0 and 0 < exp(-z) <= 1.0.
// Thus the reciprocal of the denominator never overflows.
float32x4_t vr = vrecpeq_f32(vd);
vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
vr = vfmaq_f32(vr, vr, vfmsq_f32(vone, vr, vd));
- // Reconstruct sigmoid(-z) = exp(z) / (1.0 + exp(z))
+ // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
float32x4_t vf = vmulq_f32(ve, vr);
- // Reconstruct sigmoid(x) = x < 0 ? sigmoid(z) : 1.0 - sigmoid(z)
+ // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
const uint32x4_t vm = vcltq_s32(vreinterpretq_s32_f32(vx), vmovq_n_s32(0));
vf = vbslq_f32(vm, vf, vsubq_f32(vone, vf));