Use 1-step range reduction in NEONFMA Sigmoid micro-kernels

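- Parametrize F32 Sigmoid micro-kernels by the number of range reduction steps
  (encoded in the generated file names as an "rrN" component: "rr1" for the
  NEONFMA kernels, "rr2" for the NEON kernels)
- Use 1-step range reduction in NEONFMA implementations (~5% performance
  improvement on Pixel 2 & 3a, ~6% on Mi A2 Lite)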

PiperOrigin-RevId: 288364064
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 54d1172..c39a2b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -496,24 +496,24 @@
   src/f32-prelu/gen/neon-2x8.c
   src/f32-rmax/neon.c
   src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
-  src/f32-sigmoid/gen/neon-p5-nr2recps-x4.c
-  src/f32-sigmoid/gen/neon-p5-nr2recps-x8.c
-  src/f32-sigmoid/gen/neon-p5-nr2recps-x12.c
-  src/f32-sigmoid/gen/neon-p5-nr2recps-x16.c
-  src/f32-sigmoid/gen/neon-p5-nr2recps-x20.c
-  src/f32-sigmoid/gen/neon-p5-nr2recps-x24.c
-  src/f32-sigmoid/gen/neon-lut64-p2-nr2recps-x4.c
-  src/f32-sigmoid/gen/neon-lut64-p2-nr2recps-x8.c
-  src/f32-sigmoid/gen/neon-lut64-p2-nr2recps-x12.c
-  src/f32-sigmoid/gen/neon-lut64-p2-nr2recps-x16.c
-  src/f32-sigmoid/gen/neon-lut64-p2-nr2recps-x20.c
-  src/f32-sigmoid/gen/neon-lut64-p2-nr2recps-x24.c
-  src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x4.c
-  src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x8.c
-  src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x12.c
-  src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x16.c
-  src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x20.c
-  src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x24.c
+  src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x4.c
+  src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x8.c
+  src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x12.c
+  src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x16.c
+  src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x20.c
+  src/f32-sigmoid/gen/neon-rr2-p5-nr2recps-x24.c
+  src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x4.c
+  src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x8.c
+  src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x12.c
+  src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x16.c
+  src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x20.c
+  src/f32-sigmoid/gen/neon-rr2-lut64-p2-nr2recps-x24.c
+  src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x4.c
+  src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x8.c
+  src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x12.c
+  src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x16.c
+  src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x20.c
+  src/f32-sigmoid/gen/neon-rr2-lut2048-p1-nr2recps-x24.c
   src/f32-vbinary/gen/vadd-neon-x4.c
   src/f32-vbinary/gen/vadd-neon-x8.c
   src/f32-vbinary/gen/vaddc-neon-x4.c
@@ -607,60 +607,60 @@
   src/f32-hswish/gen/neonfma-x8.c
   src/f32-ppmm/gen/4x8-neonfma.c
   src/f32-ppmm/gen/8x8-neonfma.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x4.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x8.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x12.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x20.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x24.c
-  src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x4.c
-  src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x8.c
-  src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x12.c
-  src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x16.c
-  src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x20.c
-  src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x24.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2recps-x4.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2recps-x8.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2recps-x12.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2recps-x16.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2recps-x20.c
-  src/f32-sigmoid/gen/neonfma-p5-nr2recps-x24.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2fma-x4.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2fma-x8.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2fma-x12.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2fma-x16.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2fma-x20.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2fma-x24.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr1recps1fma-x4.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr1recps1fma-x8.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr1recps1fma-x12.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr1recps1fma-x16.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr1recps1fma-x20.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr1recps1fma-x24.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2recps-x4.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2recps-x8.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2recps-x12.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2recps-x16.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2recps-x20.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-nr2recps-x24.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x4.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x8.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x12.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x16.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x20.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x24.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x4.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x8.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x12.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x16.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x20.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x24.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x4.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x8.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x12.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x16.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x20.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2fma-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2fma-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2fma-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2fma-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2fma-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2fma-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr1recps1fma-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr1recps1fma-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr1recps1fma-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr1recps1fma-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr1recps1fma-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr1recps1fma-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-nr2recps-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2fma-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2fma-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2fma-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2fma-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2fma-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2fma-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr1recps1fma-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr1recps1fma-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr1recps1fma-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr1recps1fma-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr1recps1fma-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr1recps1fma-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2recps-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2recps-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2recps-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2recps-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2recps-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-nr2recps-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2fma-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2fma-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2fma-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2fma-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2fma-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2fma-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr1recps1fma-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr1recps1fma-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr1recps1fma-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr1recps1fma-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr1recps1fma-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr1recps1fma-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-nr2recps-x24.c
   src/f32-vmulcaddc/gen/c4-neonfma-2x.c
   src/f32-vmulcaddc/gen/c8-neonfma-2x.c
   src/math/exp-neonfma-lut64-p2.c
@@ -721,24 +721,24 @@
   src/f32-dwconv-spchw/5x5p2-neonfma.c
   src/f32-dwconv-spchw/3x3s2p1-neonfma.c
   src/f32-dwconv-spchw/5x5s2p2-neonfma.c
-  src/f32-sigmoid/gen/neonfma-p5-div-x4.c
-  src/f32-sigmoid/gen/neonfma-p5-div-x8.c
-  src/f32-sigmoid/gen/neonfma-p5-div-x12.c
-  src/f32-sigmoid/gen/neonfma-p5-div-x16.c
-  src/f32-sigmoid/gen/neonfma-p5-div-x20.c
-  src/f32-sigmoid/gen/neonfma-p5-div-x24.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-div-x4.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-div-x8.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-div-x12.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-div-x16.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-div-x20.c
-  src/f32-sigmoid/gen/neonfma-lut64-p2-div-x24.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x4.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x8.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x12.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x16.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x20.c
-  src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-div-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-div-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-div-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-div-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-div-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-p5-div-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-div-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-div-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-div-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-div-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-div-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut64-p2-div-x24.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x4.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x8.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x12.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x16.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x20.c
+  src/f32-sigmoid/gen/neonfma-rr1-lut2048-p1-div-x24.c
   src/f32-spmm/gen/12x1-neonfma.c
   src/f32-spmm/gen/12x2-neonfma.c
   src/f32-spmm/gen/12x4-neonfma.c