Additional Sigmoid micro-kernels and accuracy evaluation stubs

- PSIMD micro-kernels and accuracy evaluation stubs
- ARM NEON micro-kernels using 2048-entry table lookups
- ARM NEON micro-kernels with alternative division implementations
- ARM NEON micro-kernels without FMA
- x4..x24 versions of all SIMD micro-kernels
- Eliminated the comparison with one_cutoff and the corresponding blend in all
  micro-kernels

PiperOrigin-RevId: 287804583
diff --git a/src/init.c b/src/init.c
index c01d684..bee5fd4 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1207,6 +1207,7 @@
     };
     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__psimd;
     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__psimd_x8;
+    xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__psimd_p5_div_x16;
     xnn_params.f32.prelu = (struct prelu_parameters) {
       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__psimd_2x8,
       .row_tile = 2,