Use 1-step range reduction in NEONFMA Sigmoid micro-kernels
- Parametrize F32 Sigmoid micro-kernels by the number of range reduction
steps (encoded as rr1/rr2 in the micro-kernel names)
- Use 1-step range reduction in NEONFMA implementations (~5% performance
improvement on Pixel 2 & 3a, ~6% on Mi A2 Lite); plain NEON implementations
keep 2-step range reduction
PiperOrigin-RevId: 288364064
diff --git a/src/init.c b/src/init.c
index 85872be..f0bf70f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -242,7 +242,7 @@
};
xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon;
xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x8;
- xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neon_lut64_p2_nr2recps_x8;
+ xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8;
xnn_params.f32.prelu = (struct prelu_parameters) {
.ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
.row_tile = 2,
@@ -546,7 +546,7 @@
};
xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon;
xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neonfma_x8;
- xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neonfma_lut64_p2_nr2recps_x16;
+ xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16;
xnn_params.f32.prelu = (struct prelu_parameters) {
.ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
.row_tile = 2,