Fix incorrect indirection buffer size computation for DWCONV

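Slightly reduce the memory footprint of depthwise convolutions: the previous
step_height formula over-allocated (step_width - 1) * kernel_height indirection
pointers per output row whenever step_width was greater than 1.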

PiperOrigin-RevId: 283990702
diff --git a/src/convolution-nhwc.c b/src/convolution-nhwc.c
index f1cc5ae..c3e6e91 100644
--- a/src/convolution-nhwc.c
+++ b/src/convolution-nhwc.c
@@ -967,7 +967,7 @@
       const size_t output_height = convolution_op->output_height;
       const size_t output_width = convolution_op->output_width;
       const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
-      const size_t step_height = kernel_size + (output_width * step_width - 1) * kernel_height;
+      const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
       const size_t indirection_buffer_size = sizeof(void*) * batch_size * output_height * step_height;
 
       const void** indirection_buffer =