Process 7 input rows / 3 output rows at a time in the 5x5p2 CHW depthwise convolution (dwconv) kernel, for a ~10% speedup.

PiperOrigin-RevId: 281362836
diff --git a/src/init.c b/src/init.c
index f63d7a8..dbcae34 100644
--- a/src/init.c
+++ b/src/init.c
@@ -521,7 +521,7 @@
         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma,
         .input_width_tile = 4,
         .output_width_tile = 4,
-        .output_height_tile = 2,
+        .output_height_tile = 3,
       };
       xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma,