Process 7 input rows / 3 output rows at a time in the 5x5p2 CHW depthwise convolution (dwconv) micro-kernel, for a ~10% speedup.
PiperOrigin-RevId: 281362836
diff --git a/src/init.c b/src/init.c
index f63d7a8..dbcae34 100644
--- a/src/init.c
+++ b/src/init.c
@@ -521,7 +521,7 @@
.ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma,
.input_width_tile = 4,
.output_width_tile = 4,
- .output_height_tile = 2,
+ .output_height_tile = 3,
};
xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
.ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma,