Avoid batch-replication of indirection buffer in DW Conv and Avg Pooling

- The size of the indirection buffer is no longer proportional to the batch size
- No need to reinitialize the indirection buffer after a change in batch size

PiperOrigin-RevId: 337318984
diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc
index 58c9e3d..836da0b 100644
--- a/bench/f16-dwconv.cc
+++ b/bench/f16-dwconv.cc
@@ -102,7 +102,6 @@
   convolution_op.input              = a.data();
   convolution_op.input_pixel_stride = channels;
   convolution_op.zero_buffer        = z.data();
-  convolution_op.batch_size         = 1;
   convolution_op.input_height       = input_height;
   convolution_op.input_width        = input_width;
   convolution_op.output_height      = output_height;
@@ -116,7 +115,7 @@
   convolution_op.padding_top        = padding_top;
   convolution_op.padding_left       = padding_left;
 
-  xnn_indirection_init_dwconv2d(&convolution_op, 0, step_height, step_width, 1 /* log2(sizeof(uint16_t)) */);
+  xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, 1 /* log2(sizeof(uint16_t)) */);
   for (size_t n = 1; n < num_buffers; n++) {
     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
   }
@@ -134,7 +133,7 @@
     buffer_index = (buffer_index + 1) % num_buffers;
     state.ResumeTiming();
 
-    for (uint32_t y = 0; y < output_height; y++) {
+    for (size_t y = 0; y < output_height; y++) {
       dwconv(channels, output_width,
         reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
         w.data() + buffer_index * w_elements,