Roll back the decision to split the packed weights for the CHW IBILINEAR microkernel interface
PiperOrigin-RevId: 338785454
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 355533a..806f985 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -19,7 +19,7 @@
#endif
/// The number of bytes XNNPACK may read beyond array bounds.
-/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
+/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
///
/// Note: XNNPACK reads, but never writes beyond array bounds.
#define XNN_EXTRA_BYTES 16
diff --git a/scripts/generate-f32-ibilinear-chw.sh b/scripts/generate-f32-ibilinear-chw.sh
index 1a97143..1591304 100755
--- a/scripts/generate-f32-ibilinear-chw.sh
+++ b/scripts/generate-f32-ibilinear-chw.sh
@@ -5,9 +5,9 @@
# LICENSE file in the root directory of this source tree.
#################################### Scalar ###################################
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
################################## Unit tests #################################
tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc
diff --git a/src/f32-ibilinear-chw/gen/scalar-p1.c b/src/f32-ibilinear-chw/gen/scalar-p1.c
index 9f49ca7..2fb16eb 100644
--- a/src/f32-ibilinear-chw/gen/scalar-p1.c
+++ b/src/f32-ibilinear-chw/gen/scalar-p1.c
@@ -17,8 +17,7 @@
size_t channels,
const float**restrict input,
size_t input_offset,
- const float*restrict horizontal_weights,
- const float*restrict vertical_weights,
+ const float*restrict weights,
float*restrict output,
size_t input_increment)
{
@@ -29,8 +28,7 @@
size_t c = channels;
do {
const float** i = input;
- const float* wh = horizontal_weights;
- const float* wv = vertical_weights;
+ const float* w = weights;
size_t p = output_pixels;
do {
@@ -40,8 +38,9 @@
const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
- const float valphah = *wh++;
- const float valphav = *wv++;
+ const float valphah = w[0];
+ const float valphav = w[1];
+ w += 2;
const float vtl = *i0;
const float vtr = *i1;
diff --git a/src/f32-ibilinear-chw/gen/scalar-p2.c b/src/f32-ibilinear-chw/gen/scalar-p2.c
index 11beb5c..404c654 100644
--- a/src/f32-ibilinear-chw/gen/scalar-p2.c
+++ b/src/f32-ibilinear-chw/gen/scalar-p2.c
@@ -17,8 +17,7 @@
size_t channels,
const float**restrict input,
size_t input_offset,
- const float*restrict horizontal_weights,
- const float*restrict vertical_weights,
+ const float*restrict weights,
float*restrict output,
size_t input_increment)
{
@@ -29,8 +28,7 @@
size_t c = channels;
do {
const float** i = input;
- const float* wh = horizontal_weights;
- const float* wv = vertical_weights;
+ const float* w = weights;
size_t p = output_pixels;
for (; p >= 2; p -= 2) {
@@ -44,12 +42,11 @@
const float* i7 = (const float*) ((uintptr_t) i[7] + input_offset);
i += 2 * 4;
- const float valphah0 = wh[0];
- const float valphav0 = wv[0];
- const float valphah1 = wh[1];
- const float valphav1 = wv[1];
- wh += 2;
- wv += 2;
+ const float valphah0 = w[0];
+ const float valphav0 = w[1];
+ const float valphah1 = w[2];
+ const float valphav1 = w[3];
+ w += 2 * 2;
const float vtl0 = *i0;
const float vtr0 = *i1;
@@ -88,8 +85,9 @@
const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
- const float valphah = *wh++;
- const float valphav = *wv++;
+ const float valphah = w[0];
+ const float valphav = w[1];
+ w += 2;
const float vtl = *i0;
const float vtr = *i1;
diff --git a/src/f32-ibilinear-chw/gen/scalar-p4.c b/src/f32-ibilinear-chw/gen/scalar-p4.c
index a43cca6..d198c4f 100644
--- a/src/f32-ibilinear-chw/gen/scalar-p4.c
+++ b/src/f32-ibilinear-chw/gen/scalar-p4.c
@@ -17,8 +17,7 @@
size_t channels,
const float**restrict input,
size_t input_offset,
- const float*restrict horizontal_weights,
- const float*restrict vertical_weights,
+ const float*restrict weights,
float*restrict output,
size_t input_increment)
{
@@ -29,8 +28,7 @@
size_t c = channels;
do {
const float** i = input;
- const float* wh = horizontal_weights;
- const float* wv = vertical_weights;
+ const float* w = weights;
size_t p = output_pixels;
for (; p >= 4; p -= 4) {
@@ -52,16 +50,15 @@
const float* i15 = (const float*) ((uintptr_t) i[15] + input_offset);
i += 4 * 4;
- const float valphah0 = wh[0];
- const float valphav0 = wv[0];
- const float valphah1 = wh[1];
- const float valphav1 = wv[1];
- const float valphah2 = wh[2];
- const float valphav2 = wv[2];
- const float valphah3 = wh[3];
- const float valphav3 = wv[3];
- wh += 4;
- wv += 4;
+ const float valphah0 = w[0];
+ const float valphav0 = w[1];
+ const float valphah1 = w[2];
+ const float valphav1 = w[3];
+ const float valphah2 = w[4];
+ const float valphav2 = w[5];
+ const float valphah3 = w[6];
+ const float valphav3 = w[7];
+ w += 4 * 2;
const float vtl0 = *i0;
const float vtr0 = *i1;
@@ -122,8 +119,9 @@
const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
- const float valphah = *wh++;
- const float valphav = *wv++;
+ const float valphah = w[0];
+ const float valphav = w[1];
+ w += 2;
const float vtl = *i0;
const float vtr = *i1;
diff --git a/src/f32-ibilinear-chw/scalar.c.in b/src/f32-ibilinear-chw/scalar.c.in
index 105190d..9172489 100644
--- a/src/f32-ibilinear-chw/scalar.c.in
+++ b/src/f32-ibilinear-chw/scalar.c.in
@@ -3,6 +3,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+$assert CHANNEL_TILE == 1
$assert PIXEL_TILE >= 1
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>
@@ -10,13 +11,12 @@
#include <xnnpack/ibilinear.h>
-void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}(
+void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}${"" if CHANNEL_TILE == 1 else "x%d" % CHANNEL_TILE}(
size_t output_pixels,
size_t channels,
const float**restrict input,
size_t input_offset,
- const float*restrict horizontal_weights,
- const float*restrict vertical_weights,
+ const float*restrict weights,
float*restrict output,
size_t input_increment)
{
@@ -27,8 +27,7 @@
size_t c = channels;
do {
const float** i = input;
- const float* wh = horizontal_weights;
- const float* wv = vertical_weights;
+ const float* w = weights;
size_t p = output_pixels;
$if PIXEL_TILE > 1:
@@ -38,10 +37,9 @@
i += ${PIXEL_TILE} * 4;
$for P in range(PIXEL_TILE):
- const float valphah${ABC[P]} = wh[${P}];
- const float valphav${ABC[P]} = wv[${P}];
- wh += ${PIXEL_TILE};
- wv += ${PIXEL_TILE};
+ const float valphah${ABC[P]} = w[${P * 2}];
+ const float valphav${ABC[P]} = w[${P * 2 + 1}];
+ w += ${PIXEL_TILE} * 2;
$for P in range(PIXEL_TILE):
const float vtl${ABC[P]} = *i${P * 4};
@@ -75,8 +73,9 @@
const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
- const float valphah = *wh++;
- const float valphav = *wv++;
+ const float valphah = w[0];
+ const float valphav = w[1];
+ w += 2;
const float vtl = *i0;
const float vtr = *i1;
@@ -103,8 +102,9 @@
const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
i += 4;
- const float valphah = *wh++;
- const float valphav = *wv++;
+ const float valphah = w[0];
+ const float valphav = w[1];
+ w += 2;
const float vtl = *i0;
const float vtr = *i1;
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index f08daf4..5c4252d 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -27,14 +27,13 @@
size_t output_increment);
#define DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(fn_name) \
-XNN_INTERNAL void fn_name( \
- size_t output_pixels, \
- size_t channels, \
- const float** input, \
- size_t input_offset, \
- const float* horizontal_weights, \
- const float* vertical_weights, \
- float* output, \
+XNN_INTERNAL void fn_name( \
+ size_t output_pixels, \
+ size_t channels, \
+ const float** input, \
+ size_t input_offset, \
+ const float* weights, \
+ float* output, \
size_t input_increment);
DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__scalar_c1)
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 11022aa..f504fb9 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1106,8 +1106,7 @@
size_t channels,
const float** input,
size_t input_offset,
- const float* horizontal_weights,
- const float* vertical_weights,
+ const float* weights,
float* output,
size_t input_increment);
@@ -1125,8 +1124,7 @@
size_t channels,
const void** input,
size_t input_offset,
- const float* horizontal_weights,
- const float* vertical_weights,
+ const void* weights,
void* output,
size_t input_increment);
diff --git a/test/f32-ibilinear-chw.cc b/test/f32-ibilinear-chw.cc
index a6eecd4..a41440c 100644
--- a/test/f32-ibilinear-chw.cc
+++ b/test/f32-ibilinear-chw.cc
@@ -21,7 +21,7 @@
IBilinearMicrokernelTester()
.pixels(1)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
}
TEST(F32_IBILINEAR_CHW__SCALAR_P1, pixels_gt_1) {
@@ -29,7 +29,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
}
}
@@ -39,7 +39,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(channels)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
}
}
}
@@ -51,7 +51,7 @@
.pixels(pixels)
.channels(channels)
.input_offset(7)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
}
}
}
@@ -63,7 +63,7 @@
.pixels(pixels)
.channels(channels)
.input_stride(23)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
}
}
}
@@ -73,7 +73,7 @@
IBilinearMicrokernelTester()
.pixels(2)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
TEST(F32_IBILINEAR_CHW__SCALAR_P2, pixels_div_2) {
@@ -81,7 +81,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
}
@@ -90,7 +90,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
}
@@ -99,7 +99,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
}
@@ -109,7 +109,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(channels)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
}
}
@@ -121,7 +121,7 @@
.pixels(pixels)
.channels(channels)
.input_offset(7)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
}
}
@@ -133,7 +133,7 @@
.pixels(pixels)
.channels(channels)
.input_stride(43)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
}
}
}
@@ -143,7 +143,7 @@
IBilinearMicrokernelTester()
.pixels(4)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
TEST(F32_IBILINEAR_CHW__SCALAR_P4, pixels_div_4) {
@@ -151,7 +151,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
}
@@ -160,7 +160,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
}
@@ -169,7 +169,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(1)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
}
@@ -179,7 +179,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(channels)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
}
}
@@ -191,7 +191,7 @@
.pixels(pixels)
.channels(channels)
.input_offset(7)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
}
}
@@ -203,7 +203,7 @@
.pixels(pixels)
.channels(channels)
.input_stride(83)
- .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+ .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
}
}
}
diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h
index 13e127a..6e02086 100644
--- a/test/ibilinear-microkernel-tester.h
+++ b/test/ibilinear-microkernel-tester.h
@@ -145,22 +145,20 @@
}
}
- void Test(xnn_f32_ibilinear_chw_ukernel_function ibilinear) const {
+ void TestCHW(xnn_f32_ibilinear_chw_ukernel_function ibilinear) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
std::vector<const float*> indirection(pixels() * 4);
std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + (channels() - 1) * input_stride() + 4 * pixels());
- std::vector<float, AlignedAllocator<float, 64>> horizontal_weights(pixels());
- std::vector<float, AlignedAllocator<float, 64>> vertical_weights(pixels());
+ std::vector<float, AlignedAllocator<float, 64>> packed_weights(pixels() * 2);
std::vector<float> output(pixels() * channels());
std::vector<float> output_ref(pixels() * channels());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(input.begin(), input.end(), std::ref(f32rng));
- std::generate(horizontal_weights.begin(), horizontal_weights.end(), std::ref(f32rng));
- std::generate(vertical_weights.begin(), vertical_weights.end(), std::ref(f32rng));
+ std::generate(packed_weights.begin(), packed_weights.end(), std::ref(f32rng));
std::fill(output.begin(), output.end(), nanf(""));
for (size_t i = 0; i < indirection.size(); i++) {
@@ -171,8 +169,8 @@
// Compute reference results.
for (size_t i = 0; i < pixels(); i++) {
for (size_t c = 0; c < channels(); c++) {
- const float alpha_h = horizontal_weights[i];
- const float alpha_v = vertical_weights[i];
+ const float alpha_h = packed_weights[i * 2 + 0];
+ const float alpha_v = packed_weights[i * 2 + 1];
// `c * pixels() + i` because the output is NCHW.
output_ref[c * pixels() + i] =
// `c * indirection.size()` because the input is NCHW.
@@ -187,8 +185,7 @@
ibilinear(
pixels(), channels(),
indirection.data(), input_offset() * sizeof(float),
- horizontal_weights.data(), vertical_weights.data(),
- output.data(), input_stride() * sizeof(float));
+ packed_weights.data(), output.data(), input_stride() * sizeof(float));
// Verify results.
for (size_t c = 0; c < channels(); c++) {
diff --git a/tools/generate-ibilinear-chw-test.py b/tools/generate-ibilinear-chw-test.py
index c71476d..f1a1efb 100755
--- a/tools/generate-ibilinear-chw-test.py
+++ b/tools/generate-ibilinear-chw-test.py
@@ -43,7 +43,7 @@
IBilinearMicrokernelTester()
.pixels(${PIXEL_TILE})
.channels(${CHANNEL_TILE})
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
$if PIXEL_TILE > 1:
@@ -54,7 +54,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(${CHANNEL_TILE})
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
@@ -65,7 +65,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(${CHANNEL_TILE})
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
@@ -76,7 +76,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(${CHANNEL_TILE})
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
@@ -89,7 +89,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(channels)
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
}
@@ -102,7 +102,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(channels)
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
}
@@ -115,7 +115,7 @@
IBilinearMicrokernelTester()
.pixels(pixels)
.channels(channels)
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
}
@@ -129,7 +129,7 @@
.pixels(pixels)
.channels(channels)
.input_offset(${next_prime(CHANNEL_TILE * 5 + 1)})
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
}
@@ -143,7 +143,7 @@
.pixels(pixels)
.channels(channels)
.input_stride(${next_prime(4 * (PIXEL_TILE * 5) + 1)})
- .Test(${TEST_FUNC});
+ .TestCHW(${TEST_FUNC});
}
}
}