Fix mismatch in block layout in mixed-layout Depth-To-Space operator

The NCHW->NHWC Depth-To-Space operator assumed a different layout of blocks
within the input channels than the NHWC Depth-To-Space operator. This caused
dense prediction models to produce wrong results in sparse inference if they
had more than one output channel.

PiperOrigin-RevId: 359797307
diff --git a/src/operator-run.c b/src/operator-run.c
index 1dc06fb..bf6b2bc 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -453,10 +453,7 @@
     context->block_size,
     (const void*) ((uintptr_t) context->input + batch_index * context->input_batch_stride),
     (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride),
-    context->input_channel_stride,
-    context->input_height_stride,
-    context->output_height_stride,
-    context->output_width_stride);
+    context->output_channel_stride);
 }
 
 void xnn_compute_argmax_pooling_unipass(
diff --git a/src/operators/depth-to-space-nchw2nhwc.c b/src/operators/depth-to-space-nchw2nhwc.c
index 5a51454..518f28d 100644
--- a/src/operators/depth-to-space-nchw2nhwc.c
+++ b/src/operators/depth-to-space-nchw2nhwc.c
@@ -138,10 +138,7 @@
     .output = output,
     .input_batch_stride = depth_to_space_op->input_pixel_stride * input_height * input_width * sizeof(float),
     .output_batch_stride = depth_to_space_op->output_pixel_stride * output_height * output_width * sizeof(float),
-    .input_channel_stride = input_height * input_width * sizeof(float),
-    .input_height_stride = input_width * sizeof(float),
-    .output_height_stride = depth_to_space_op->output_pixel_stride * output_width * sizeof(float),
-    .output_width_stride = depth_to_space_op->output_pixel_stride * sizeof(float),
+    .output_channel_stride = depth_to_space_op->output_pixel_stride,
     .ukernel = xnn_params.x32.depthtospace2d_chw2hwc.ukernel,
   };
 
diff --git a/src/x32-depthtospace2d-chw2hwc/scalar.c b/src/x32-depthtospace2d-chw2hwc/scalar.c
index 80e453b..2d1fa54 100644
--- a/src/x32-depthtospace2d-chw2hwc/scalar.c
+++ b/src/x32-depthtospace2d-chw2hwc/scalar.c
@@ -15,72 +15,23 @@
     size_t block_size,
     const uint32_t*restrict input,
     uint32_t*restrict output,
-    size_t input_channel_stride,
-    size_t input_height_stride,
-    size_t output_height_stride,
-    size_t output_width_stride)
+    size_t output_channel_stride)
 {
   assert(output_channels != 0);
   assert(input_height != 0);
   assert(input_width != 0);
   assert(block_size != 0);
 
-  // output[(iy * block_size + by) * output_height_stride +
-  //        (ix * block_size + bx) * output_width_stride +
-  //        c * element_stride] =
-  //     input[
-  //         (c * block_size * block_size + by * block_size + bx) * input_channel_stride +
-  //         iy * input_height_stride +
-  //         ix * element_stride]
-
-  const size_t element_stride = sizeof(uint32_t);
-
-  const size_t iy_output_increment = block_size * output_height_stride;
-  const size_t by_output_increment = output_height_stride;
-  const size_t ix_output_increment = block_size * output_width_stride;
-  const size_t bx_output_increment = output_width_stride;
-  const size_t c_output_increment = element_stride;
-
-  const size_t c_input_increment = block_size * block_size * input_channel_stride;
-  const size_t by_input_increment = block_size * input_channel_stride;
-  const size_t bx_input_increment = input_channel_stride;
-  const size_t iy_input_increment = input_height_stride;
-  const size_t ix_input_increment = element_stride;
-
-  size_t iy = input_height;
-  uintptr_t i_iy = (uintptr_t) input;
-  uintptr_t o_iy = (uintptr_t) output;
-  do {
-    size_t by = block_size;
-    uintptr_t i_by = i_iy;
-    uintptr_t o_by = o_iy;
-    do {
-      size_t ix = input_width;
-      uintptr_t i_ix = i_by;
-      uintptr_t o_ix = o_by;
-      do {
-        size_t bx = block_size;
-        uintptr_t i_bx = i_ix;
-        uintptr_t o_bx = o_ix;
-        do {
-          size_t c = output_channels;
-          uintptr_t i_c = i_bx;
-          uintptr_t o_c = o_bx;
-          do {
-            *(uint32_t*) o_c = *(uint32_t*) i_c;
-            i_c += c_input_increment;
-            o_c += c_output_increment;
-          } while (--c != 0);
-          i_bx += bx_input_increment;
-          o_bx += bx_output_increment;
-        } while (--bx != 0);
-        i_ix += ix_input_increment;
-        o_ix += ix_output_increment;
-      } while (--ix != 0);
-      i_by += by_input_increment;
-      o_by += by_output_increment;
-    } while (--by != 0);
-    i_iy += iy_input_increment;
-    o_iy += iy_output_increment;
-  } while (--iy != 0);
+  for (size_t iy = 0; iy < input_height; iy++) {
+    for (size_t by = 0; by < block_size; by++) {
+      for (size_t ix = 0; ix < input_width; ix++) {
+        for (size_t bx = 0; bx < block_size; bx++) {
+          for (size_t oc = 0; oc < output_channels; oc++) {
+            output[(((iy * block_size + by) * input_width + ix) * block_size + bx) * output_channel_stride + oc] =
+              input[(((by * block_size + bx) * output_channels + oc) * input_height + iy) * input_width + ix];
+          }
+        }
+      }
+    }
+  }
 }
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 9f198e7..e76d102 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -476,10 +476,7 @@
   void* output;
   size_t input_batch_stride;
   size_t output_batch_stride;
-  size_t input_channel_stride;
-  size_t input_height_stride;
-  size_t output_height_stride;
-  size_t output_width_stride;
+  size_t output_channel_stride;
   xnn_depthtospace2d_chw2hwc_ukernel_function ukernel;
 };
 
diff --git a/src/xnnpack/depthtospace.h b/src/xnnpack/depthtospace.h
index 32d63b2..285fd3c 100644
--- a/src/xnnpack/depthtospace.h
+++ b/src/xnnpack/depthtospace.h
@@ -23,10 +23,7 @@
       size_t block_size,                                             \
       const uint32_t* input,                                         \
       uint32_t* output,                                              \
-      size_t input_channel_stride,                                   \
-      size_t input_height_stride,                                    \
-      size_t output_height_stride,                                   \
-      size_t output_width_stride);
+      size_t output_channel_stride);
 
 DECLARE_X32_DEPTHTOSPACE2D_CHW2HWC_UKERNEL_FUNCTION(xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar)
 
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index a9e3c66..18fdf53 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -935,10 +935,7 @@
     size_t block_size,
     const void* input,
     void* output,
-    size_t input_channel_stride,
-    size_t input_height_stride,
-    size_t output_height_stride,
-    size_t output_width_stride);
+    size_t output_channels_stride);
 
 typedef void (*xnn_x32_depthtospace2d_chw2hwc_ukernel_function)(
     size_t output_channels,
@@ -947,10 +944,7 @@
     size_t block_size,
     const uint32_t* input,
     uint32_t* output,
-    size_t input_channel_stride,
-    size_t input_height_stride,
-    size_t output_height_stride,
-    size_t output_width_stride);
+    size_t output_channel_stride);
 
 typedef void (*xnn_pad_ukernel_function)(
     size_t rows,
diff --git a/test/depth-to-space-operator-tester.h b/test/depth-to-space-operator-tester.h
index c189a65..8e854bb 100644
--- a/test/depth-to-space-operator-tester.h
+++ b/test/depth-to-space-operator-tester.h
@@ -240,7 +240,7 @@
                 for (size_t oc = 0; oc < output_channels(); oc++) {
                   const size_t input_index =
                     i * input_channels_stride() * input_height() * input_width() +
-                    (((oc * block_size() + by) * block_size() + bx) * input_height() + iy) * input_width() + ix;
+                    (((by * block_size() + bx) * output_channels() + oc) * input_height() + iy) * input_width() + ix;
                   const size_t output_index =
                     ((i * output_height() + iy * block_size() + by) * output_width() + ix * block_size() + bx) *
                       output_channels_stride() + oc;
diff --git a/test/depthtospace-microkernel-tester.h b/test/depthtospace-microkernel-tester.h
index 3984c66..44042ba 100644
--- a/test/depthtospace-microkernel-tester.h
+++ b/test/depthtospace-microkernel-tester.h
@@ -74,69 +74,17 @@
     return this->block_size_;
   }
 
-  inline DepthToSpaceMicrokernelTester& element_size(size_t element_size) {
-    assert(element_size != 0);
-    this->element_size_ = element_size;
+  inline DepthToSpaceMicrokernelTester& output_channel_stride(size_t output_channel_stride) {
+    assert(output_channel_stride != 0);
+    this->output_channel_stride_ = output_channel_stride;
     return *this;
   }
 
-  inline size_t element_size() const {
-    return this->element_size_;
-  }
-
-  inline DepthToSpaceMicrokernelTester& input_channel_stride(size_t input_channel_stride) {
-    assert(input_channel_stride != 0);
-    this->input_channel_stride_ = input_channel_stride;
-    return *this;
-  }
-
-  inline size_t input_channel_stride() const {
-    if (this->input_channel_stride_ != 0) {
-      return this->input_channel_stride_;
+  inline size_t output_channel_stride() const {
+    if (this->output_channel_stride_ != 0) {
+      return this->output_channel_stride_;
     } else {
-      return this->input_height() * this->input_width() * this->element_size();
-    }
-  }
-
-  inline DepthToSpaceMicrokernelTester& input_height_stride(size_t input_height_stride) {
-    assert(input_height_stride != 0);
-    this->input_height_stride_ = input_height_stride;
-    return *this;
-  }
-
-  inline size_t input_height_stride() const {
-    if (this->input_height_stride_ != 0) {
-      return this->input_height_stride_;
-    } else {
-      return this->input_width() * this->element_size();
-    }
-  }
-
-  inline DepthToSpaceMicrokernelTester& output_height_stride(size_t output_height_stride) {
-    assert(output_height_stride != 0);
-    this->output_height_stride_ = output_height_stride;
-    return *this;
-  }
-
-  inline size_t output_height_stride() const {
-    if (this->output_height_stride_ != 0) {
-      return this->output_height_stride_;
-    } else {
-      return this->output_width() * this->output_channels() * this->element_size();
-    }
-  }
-
-  inline DepthToSpaceMicrokernelTester& output_width_stride(size_t output_width_stride) {
-    assert(output_width_stride != 0);
-    this->output_width_stride_ = output_width_stride;
-    return *this;
-  }
-
-  inline size_t output_width_stride() const {
-    if (this->output_width_stride_ != 0) {
-      return this->output_width_stride_;
-    } else {
-      return this->output_channels() * this->element_size();
+      return this->output_channels();
     }
   }
 
@@ -150,30 +98,14 @@
   }
 
   void Test(xnn_x32_depthtospace2d_chw2hwc_ukernel_function depthtospace2d) const {
-    ASSERT_EQ(element_size(), sizeof(uint32_t));
     ASSERT_GE(block_size(), 2);
-    ASSERT_GE(input_channel_stride(), input_height() * input_height_stride());
-    ASSERT_GE(input_height_stride(), input_width() * element_size());
-    ASSERT_GE(output_height_stride(), input_width() * block_size() * output_width_stride());
-    ASSERT_GE(output_width_stride(), output_channels() * element_size());
 
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
     auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
 
-    const size_t input_byte_size =
-        (input_channels() - 1) * input_channel_stride() +
-        (input_height() - 1) * input_height_stride() +
-        input_width() * element_size();
-    ASSERT_EQ(input_byte_size % element_size(), 0);
-    std::vector<uint32_t> input(input_byte_size / element_size());
-
-    const size_t output_byte_size =
-        (output_height() - 1) * output_height_stride() +
-        (output_width() - 1) * output_width_stride() +
-        output_channels() * element_size();
-    ASSERT_EQ(output_byte_size % element_size(), 0);
-    std::vector<uint32_t> output(output_byte_size / element_size());
+    std::vector<uint32_t> input(input_channels() * input_height() * input_width());
+    std::vector<uint32_t> output((output_height() * output_width() - 1) * output_channel_stride() + output_channels());
 
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
       std::generate(input.begin(), input.end(), std::ref(u32rng));
@@ -186,36 +118,25 @@
         block_size(),
         input.data(),
         output.data(),
-        input_channel_stride(),
-        input_height_stride(),
-        output_height_stride(),
-        output_width_stride());
+        output_channel_stride());
 
       // Verify results.
-      for (size_t iy = 0; iy < input_height(); ++iy) {
-        for (size_t by = 0; by < block_size(); ++by) {
-          for (size_t ix = 0; ix < input_width(); ++ix) {
-            for (size_t bx = 0; bx < block_size(); ++bx) {
-              for (size_t c = 0; c < output_channels(); ++c) {
-                size_t input_offset =
-                    (c * block_size() * block_size() + by * block_size() + bx) * input_channel_stride() +
-                    iy * input_height_stride() +
-                    ix * element_size();
-                ASSERT_EQ(input_offset % element_size(), 0);
-                ASSERT_LT(input_offset / element_size(), input.size());
-
-                size_t output_offset =
-                    (iy * block_size() + by) * output_height_stride() +
-                    (ix * block_size() + bx) * output_width_stride() +
-                    c * element_size();
-                ASSERT_EQ(output_offset % element_size(), 0);
-                ASSERT_LT(output_offset / element_size(), output.size());
-
-                ASSERT_EQ(output[output_offset / element_size()],
-                          input[input_offset / element_size()])
-                    << "iy = " << iy << ", " << "by = " << by << ", "
-                    << "ix = " << ix << ", " << "bx = " << bx << ", "
-                    << "c = " << c;
+      for (size_t iy = 0; iy < input_height(); iy++) {
+        for (size_t by = 0; by < block_size(); by++) {
+          for (size_t ix = 0; ix < input_width(); ix++) {
+            for (size_t bx = 0; bx < block_size(); bx++) {
+              for (size_t oc = 0; oc < output_channels(); oc++) {
+                const size_t input_index =
+                  (((by * block_size() + bx) * output_channels() + oc) * input_height() + iy) * input_width() + ix;
+                const size_t output_index =
+                  ((iy * block_size() + by) * output_width() + ix * block_size() + bx) * output_channel_stride() + oc;
+                ASSERT_EQ(output[output_index], input[input_index])
+                  << "input x: " << ix << " / " << input_width()
+                  << ", input y: " << iy << " / " << input_height()
+                  << ", block x: " << bx << " / " << block_size()
+                  << ", block y: " << by << " / " << block_size()
+                  << ", output channel: " << oc << " / " << output_channels()
+                  << ", output stride: " << output_channel_stride();
               }
             }
           }
@@ -229,10 +150,6 @@
   size_t input_height_{1};
   size_t input_width_{1};
   size_t block_size_{2};
-  size_t element_size_{4};
-  size_t input_channel_stride_{0};
-  size_t input_height_stride_{0};
-  size_t output_height_stride_{0};
-  size_t output_width_stride_{0};
+  size_t output_channel_stride_{0};
   size_t iterations_{3};
 };