Fix mismatch in block layout in mixed-layout Depth-To-Space operator
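The NCHW->NHWC Depth-To-Space operator assumed a different layout of blocks within
input channels than the NHWC Depth-To-Space operator. This caused dense prediction models to produce wrong results in sparse inference if they had more than one output channel.
The input channel index is now computed as (by * block_size + bx) * output_channels + oc instead of (oc * block_size + by) * block_size + bx.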
PiperOrigin-RevId: 359797307
diff --git a/src/operator-run.c b/src/operator-run.c
index 1dc06fb..bf6b2bc 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -453,10 +453,7 @@
context->block_size,
(const void*) ((uintptr_t) context->input + batch_index * context->input_batch_stride),
(void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride),
- context->input_channel_stride,
- context->input_height_stride,
- context->output_height_stride,
- context->output_width_stride);
+ context->output_channel_stride);
}
void xnn_compute_argmax_pooling_unipass(
diff --git a/src/operators/depth-to-space-nchw2nhwc.c b/src/operators/depth-to-space-nchw2nhwc.c
index 5a51454..518f28d 100644
--- a/src/operators/depth-to-space-nchw2nhwc.c
+++ b/src/operators/depth-to-space-nchw2nhwc.c
@@ -138,10 +138,7 @@
.output = output,
.input_batch_stride = depth_to_space_op->input_pixel_stride * input_height * input_width * sizeof(float),
.output_batch_stride = depth_to_space_op->output_pixel_stride * output_height * output_width * sizeof(float),
- .input_channel_stride = input_height * input_width * sizeof(float),
- .input_height_stride = input_width * sizeof(float),
- .output_height_stride = depth_to_space_op->output_pixel_stride * output_width * sizeof(float),
- .output_width_stride = depth_to_space_op->output_pixel_stride * sizeof(float),
+ .output_channel_stride = depth_to_space_op->output_pixel_stride,
.ukernel = xnn_params.x32.depthtospace2d_chw2hwc.ukernel,
};
diff --git a/src/x32-depthtospace2d-chw2hwc/scalar.c b/src/x32-depthtospace2d-chw2hwc/scalar.c
index 80e453b..2d1fa54 100644
--- a/src/x32-depthtospace2d-chw2hwc/scalar.c
+++ b/src/x32-depthtospace2d-chw2hwc/scalar.c
@@ -15,72 +15,23 @@
size_t block_size,
const uint32_t*restrict input,
uint32_t*restrict output,
- size_t input_channel_stride,
- size_t input_height_stride,
- size_t output_height_stride,
- size_t output_width_stride)
+ size_t output_channel_stride)
{
assert(output_channels != 0);
assert(input_height != 0);
assert(input_width != 0);
assert(block_size != 0);
- // output[(iy * block_size + by) * output_height_stride +
- // (ix * block_size + bx) * output_width_stride +
- // c * element_stride] =
- // input[
- // (c * block_size * block_size + by * block_size + bx) * input_channel_stride +
- // iy * input_height_stride +
- // ix * element_stride]
-
- const size_t element_stride = sizeof(uint32_t);
-
- const size_t iy_output_increment = block_size * output_height_stride;
- const size_t by_output_increment = output_height_stride;
- const size_t ix_output_increment = block_size * output_width_stride;
- const size_t bx_output_increment = output_width_stride;
- const size_t c_output_increment = element_stride;
-
- const size_t c_input_increment = block_size * block_size * input_channel_stride;
- const size_t by_input_increment = block_size * input_channel_stride;
- const size_t bx_input_increment = input_channel_stride;
- const size_t iy_input_increment = input_height_stride;
- const size_t ix_input_increment = element_stride;
-
- size_t iy = input_height;
- uintptr_t i_iy = (uintptr_t) input;
- uintptr_t o_iy = (uintptr_t) output;
- do {
- size_t by = block_size;
- uintptr_t i_by = i_iy;
- uintptr_t o_by = o_iy;
- do {
- size_t ix = input_width;
- uintptr_t i_ix = i_by;
- uintptr_t o_ix = o_by;
- do {
- size_t bx = block_size;
- uintptr_t i_bx = i_ix;
- uintptr_t o_bx = o_ix;
- do {
- size_t c = output_channels;
- uintptr_t i_c = i_bx;
- uintptr_t o_c = o_bx;
- do {
- *(uint32_t*) o_c = *(uint32_t*) i_c;
- i_c += c_input_increment;
- o_c += c_output_increment;
- } while (--c != 0);
- i_bx += bx_input_increment;
- o_bx += bx_output_increment;
- } while (--bx != 0);
- i_ix += ix_input_increment;
- o_ix += ix_output_increment;
- } while (--ix != 0);
- i_by += by_input_increment;
- o_by += by_output_increment;
- } while (--by != 0);
- i_iy += iy_input_increment;
- o_iy += iy_output_increment;
- } while (--iy != 0);
+ for (size_t iy = 0; iy < input_height; iy++) {
+ for (size_t by = 0; by < block_size; by++) {
+ for (size_t ix = 0; ix < input_width; ix++) {
+ for (size_t bx = 0; bx < block_size; bx++) {
+ for (size_t oc = 0; oc < output_channels; oc++) {
+ output[(((iy * block_size + by) * input_width + ix) * block_size + bx) * output_channel_stride + oc] =
+ input[(((by * block_size + bx) * output_channels + oc) * input_height + iy) * input_width + ix];
+ }
+ }
+ }
+ }
+ }
}
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 9f198e7..e76d102 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -476,10 +476,7 @@
void* output;
size_t input_batch_stride;
size_t output_batch_stride;
- size_t input_channel_stride;
- size_t input_height_stride;
- size_t output_height_stride;
- size_t output_width_stride;
+ size_t output_channel_stride;
xnn_depthtospace2d_chw2hwc_ukernel_function ukernel;
};
diff --git a/src/xnnpack/depthtospace.h b/src/xnnpack/depthtospace.h
index 32d63b2..285fd3c 100644
--- a/src/xnnpack/depthtospace.h
+++ b/src/xnnpack/depthtospace.h
@@ -23,10 +23,7 @@
size_t block_size, \
const uint32_t* input, \
uint32_t* output, \
- size_t input_channel_stride, \
- size_t input_height_stride, \
- size_t output_height_stride, \
- size_t output_width_stride);
+ size_t output_channel_stride);
DECLARE_X32_DEPTHTOSPACE2D_CHW2HWC_UKERNEL_FUNCTION(xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar)
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index a9e3c66..18fdf53 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -935,10 +935,7 @@
size_t block_size,
const void* input,
void* output,
- size_t input_channel_stride,
- size_t input_height_stride,
- size_t output_height_stride,
- size_t output_width_stride);
+ size_t output_channels_stride);
typedef void (*xnn_x32_depthtospace2d_chw2hwc_ukernel_function)(
size_t output_channels,
@@ -947,10 +944,7 @@
size_t block_size,
const uint32_t* input,
uint32_t* output,
- size_t input_channel_stride,
- size_t input_height_stride,
- size_t output_height_stride,
- size_t output_width_stride);
+ size_t output_channel_stride);
typedef void (*xnn_pad_ukernel_function)(
size_t rows,
diff --git a/test/depth-to-space-operator-tester.h b/test/depth-to-space-operator-tester.h
index c189a65..8e854bb 100644
--- a/test/depth-to-space-operator-tester.h
+++ b/test/depth-to-space-operator-tester.h
@@ -240,7 +240,7 @@
for (size_t oc = 0; oc < output_channels(); oc++) {
const size_t input_index =
i * input_channels_stride() * input_height() * input_width() +
- (((oc * block_size() + by) * block_size() + bx) * input_height() + iy) * input_width() + ix;
+ (((by * block_size() + bx) * output_channels() + oc) * input_height() + iy) * input_width() + ix;
const size_t output_index =
((i * output_height() + iy * block_size() + by) * output_width() + ix * block_size() + bx) *
output_channels_stride() + oc;
diff --git a/test/depthtospace-microkernel-tester.h b/test/depthtospace-microkernel-tester.h
index 3984c66..44042ba 100644
--- a/test/depthtospace-microkernel-tester.h
+++ b/test/depthtospace-microkernel-tester.h
@@ -74,69 +74,17 @@
return this->block_size_;
}
- inline DepthToSpaceMicrokernelTester& element_size(size_t element_size) {
- assert(element_size != 0);
- this->element_size_ = element_size;
+ inline DepthToSpaceMicrokernelTester& output_channel_stride(size_t output_channel_stride) {
+ assert(output_channel_stride != 0);
+ this->output_channel_stride_ = output_channel_stride;
return *this;
}
- inline size_t element_size() const {
- return this->element_size_;
- }
-
- inline DepthToSpaceMicrokernelTester& input_channel_stride(size_t input_channel_stride) {
- assert(input_channel_stride != 0);
- this->input_channel_stride_ = input_channel_stride;
- return *this;
- }
-
- inline size_t input_channel_stride() const {
- if (this->input_channel_stride_ != 0) {
- return this->input_channel_stride_;
+ inline size_t output_channel_stride() const {
+ if (this->output_channel_stride_ != 0) {
+ return this->output_channel_stride_;
} else {
- return this->input_height() * this->input_width() * this->element_size();
- }
- }
-
- inline DepthToSpaceMicrokernelTester& input_height_stride(size_t input_height_stride) {
- assert(input_height_stride != 0);
- this->input_height_stride_ = input_height_stride;
- return *this;
- }
-
- inline size_t input_height_stride() const {
- if (this->input_height_stride_ != 0) {
- return this->input_height_stride_;
- } else {
- return this->input_width() * this->element_size();
- }
- }
-
- inline DepthToSpaceMicrokernelTester& output_height_stride(size_t output_height_stride) {
- assert(output_height_stride != 0);
- this->output_height_stride_ = output_height_stride;
- return *this;
- }
-
- inline size_t output_height_stride() const {
- if (this->output_height_stride_ != 0) {
- return this->output_height_stride_;
- } else {
- return this->output_width() * this->output_channels() * this->element_size();
- }
- }
-
- inline DepthToSpaceMicrokernelTester& output_width_stride(size_t output_width_stride) {
- assert(output_width_stride != 0);
- this->output_width_stride_ = output_width_stride;
- return *this;
- }
-
- inline size_t output_width_stride() const {
- if (this->output_width_stride_ != 0) {
- return this->output_width_stride_;
- } else {
- return this->output_channels() * this->element_size();
+ return this->output_channels();
}
}
@@ -150,30 +98,14 @@
}
void Test(xnn_x32_depthtospace2d_chw2hwc_ukernel_function depthtospace2d) const {
- ASSERT_EQ(element_size(), sizeof(uint32_t));
ASSERT_GE(block_size(), 2);
- ASSERT_GE(input_channel_stride(), input_height() * input_height_stride());
- ASSERT_GE(input_height_stride(), input_width() * element_size());
- ASSERT_GE(output_height_stride(), input_width() * block_size() * output_width_stride());
- ASSERT_GE(output_width_stride(), output_channels() * element_size());
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
- const size_t input_byte_size =
- (input_channels() - 1) * input_channel_stride() +
- (input_height() - 1) * input_height_stride() +
- input_width() * element_size();
- ASSERT_EQ(input_byte_size % element_size(), 0);
- std::vector<uint32_t> input(input_byte_size / element_size());
-
- const size_t output_byte_size =
- (output_height() - 1) * output_height_stride() +
- (output_width() - 1) * output_width_stride() +
- output_channels() * element_size();
- ASSERT_EQ(output_byte_size % element_size(), 0);
- std::vector<uint32_t> output(output_byte_size / element_size());
+ std::vector<uint32_t> input(input_channels() * input_height() * input_width());
+ std::vector<uint32_t> output((output_height() * output_width() - 1) * output_channel_stride() + output_channels());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(u32rng));
@@ -186,36 +118,25 @@
block_size(),
input.data(),
output.data(),
- input_channel_stride(),
- input_height_stride(),
- output_height_stride(),
- output_width_stride());
+ output_channel_stride());
// Verify results.
- for (size_t iy = 0; iy < input_height(); ++iy) {
- for (size_t by = 0; by < block_size(); ++by) {
- for (size_t ix = 0; ix < input_width(); ++ix) {
- for (size_t bx = 0; bx < block_size(); ++bx) {
- for (size_t c = 0; c < output_channels(); ++c) {
- size_t input_offset =
- (c * block_size() * block_size() + by * block_size() + bx) * input_channel_stride() +
- iy * input_height_stride() +
- ix * element_size();
- ASSERT_EQ(input_offset % element_size(), 0);
- ASSERT_LT(input_offset / element_size(), input.size());
-
- size_t output_offset =
- (iy * block_size() + by) * output_height_stride() +
- (ix * block_size() + bx) * output_width_stride() +
- c * element_size();
- ASSERT_EQ(output_offset % element_size(), 0);
- ASSERT_LT(output_offset / element_size(), output.size());
-
- ASSERT_EQ(output[output_offset / element_size()],
- input[input_offset / element_size()])
- << "iy = " << iy << ", " << "by = " << by << ", "
- << "ix = " << ix << ", " << "bx = " << bx << ", "
- << "c = " << c;
+ for (size_t iy = 0; iy < input_height(); iy++) {
+ for (size_t by = 0; by < block_size(); by++) {
+ for (size_t ix = 0; ix < input_width(); ix++) {
+ for (size_t bx = 0; bx < block_size(); bx++) {
+ for (size_t oc = 0; oc < output_channels(); oc++) {
+ const size_t input_index =
+ (((by * block_size() + bx) * output_channels() + oc) * input_height() + iy) * input_width() + ix;
+ const size_t output_index =
+ ((iy * block_size() + by) * output_width() + ix * block_size() + bx) * output_channel_stride() + oc;
+ ASSERT_EQ(output[output_index], input[input_index])
+ << "input x: " << ix << " / " << input_width()
+ << ", input y: " << iy << " / " << input_height()
+ << ", block x: " << bx << " / " << block_size()
+ << ", block y: " << by << " / " << block_size()
+ << ", output channel: " << oc << " / " << output_channels()
+ << ", output stride: " << output_channel_stride();
}
}
}
@@ -229,10 +150,6 @@
size_t input_height_{1};
size_t input_width_{1};
size_t block_size_{2};
- size_t element_size_{4};
- size_t input_channel_stride_{0};
- size_t input_height_stride_{0};
- size_t output_height_stride_{0};
- size_t output_width_stride_{0};
+ size_t output_channel_stride_{0};
size_t iterations_{3};
};