Fix bug in EXPM1MINUS and ELU implementations

Zero out both the scale (s) and the reduced argument (t) for inputs at or
below the saturation cutoff to guarantee saturation at -1. Otherwise, the
multiplication by log(2) can overflow, resulting in NaN outputs.

PiperOrigin-RevId: 347470642
diff --git a/src/math/expm1minus-scalar-rr2-lut16-p4.c b/src/math/expm1minus-scalar-rr2-lut16-p4.c
index 474a851..8a085bb 100644
--- a/src/math/expm1minus-scalar-rr2-lut16-p4.c
+++ b/src/math/expm1minus-scalar-rr2-lut16-p4.c
@@ -70,17 +70,18 @@
     // Subtract the large number back to get final n := round(x / log(2), 4).
     vn -= vmagic_bias;
 
-    // The function saturates at -1 for large negative inputs: expm1f(x) == -1.0f for x <= sat_cutoff ~= -17.328680.
-    // To guarantee this behaviour, we zero out s (scale) for x <= sat_cutoff.
-    if XNN_UNPREDICTABLE(vx <= vsat_cutoff) {
-      vs = 0.0f;
-    }
-
     // Compute reduced argument t := x - n * log(2).
     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
     float vt = vn * vminus_ln2_hi + vx;
     vt = vn * vminus_ln2_lo + vt;
 
+    // The function saturates at -1 for large negative inputs: expm1f(x) == -1.0f for x <= sat_cutoff ~= -17.328680.
+    // To guarantee this behaviour, we zero out s (scale) and t (reduced argument) for x <= sat_cutoff.
+    if XNN_UNPREDICTABLE(vx <= vsat_cutoff) {
+      vs = 0.0f;
+      vt = 0.0f;
+    }
+
     // Compute degree-4 polynomial approximation for exp(t) - 1 on [-log(2)/32, log(2)/32].
     //   P(t) = t * (1 + t * (c2 + t * (c3 + t * c4))) = t + t * (t * (c2 + t * (c3 + t * c4))) = t + t * p
     float vp = vc4 * vt + vc3;