Roll back the decision to split the packed weights for the CHW IBILINEAR microkernel interface

PiperOrigin-RevId: 338785454
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 355533a..806f985 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -19,7 +19,7 @@
 #endif
 
 /// The number of bytes XNNPACK may read beyond array bounds.
-/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
+/// The caller must allocate at least this many extra bytes after the tensor data passed to XNNPACK.
 ///
 /// Note: XNNPACK reads, but never writes beyond array bounds.
 #define XNN_EXTRA_BYTES 16
diff --git a/scripts/generate-f32-ibilinear-chw.sh b/scripts/generate-f32-ibilinear-chw.sh
index 1a97143..1591304 100755
--- a/scripts/generate-f32-ibilinear-chw.sh
+++ b/scripts/generate-f32-ibilinear-chw.sh
@@ -5,9 +5,9 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
-tools/xngen src/f32-ibilinear-chw/scalar.c.in -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-ibilinear-chw/gen/scalar-p1.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=2 -o src/f32-ibilinear-chw/gen/scalar-p2.c
+tools/xngen src/f32-ibilinear-chw/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=4 -o src/f32-ibilinear-chw/gen/scalar-p4.c
 
 ################################## Unit tests #################################
 tools/generate-ibilinear-chw-test.py --spec test/f32-ibilinear-chw.yaml --output test/f32-ibilinear-chw.cc
diff --git a/src/f32-ibilinear-chw/gen/scalar-p1.c b/src/f32-ibilinear-chw/gen/scalar-p1.c
index 9f49ca7..2fb16eb 100644
--- a/src/f32-ibilinear-chw/gen/scalar-p1.c
+++ b/src/f32-ibilinear-chw/gen/scalar-p1.c
@@ -17,8 +17,7 @@
     size_t channels,
     const float**restrict input,
     size_t input_offset,
-    const float*restrict horizontal_weights,
-    const float*restrict vertical_weights,
+    const float*restrict weights,
     float*restrict output,
     size_t input_increment)
 {
@@ -29,8 +28,7 @@
   size_t c = channels;
   do {
     const float** i = input;
-    const float* wh = horizontal_weights;
-    const float* wv = vertical_weights;
+    const float* w = weights;
 
     size_t p = output_pixels;
     do {
@@ -40,8 +38,9 @@
       const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
       i += 4;
 
-      const float valphah = *wh++;
-      const float valphav = *wv++;
+      const float valphah = w[0];
+      const float valphav = w[1];
+      w += 2;
 
       const float vtl = *i0;
       const float vtr = *i1;
diff --git a/src/f32-ibilinear-chw/gen/scalar-p2.c b/src/f32-ibilinear-chw/gen/scalar-p2.c
index 11beb5c..404c654 100644
--- a/src/f32-ibilinear-chw/gen/scalar-p2.c
+++ b/src/f32-ibilinear-chw/gen/scalar-p2.c
@@ -17,8 +17,7 @@
     size_t channels,
     const float**restrict input,
     size_t input_offset,
-    const float*restrict horizontal_weights,
-    const float*restrict vertical_weights,
+    const float*restrict weights,
     float*restrict output,
     size_t input_increment)
 {
@@ -29,8 +28,7 @@
   size_t c = channels;
   do {
     const float** i = input;
-    const float* wh = horizontal_weights;
-    const float* wv = vertical_weights;
+    const float* w = weights;
 
     size_t p = output_pixels;
     for (; p >= 2; p -= 2) {
@@ -44,12 +42,11 @@
       const float* i7 = (const float*) ((uintptr_t) i[7] + input_offset);
       i += 2 * 4;
 
-      const float valphah0 = wh[0];
-      const float valphav0 = wv[0];
-      const float valphah1 = wh[1];
-      const float valphav1 = wv[1];
-      wh += 2;
-      wv += 2;
+      const float valphah0 = w[0];
+      const float valphav0 = w[1];
+      const float valphah1 = w[2];
+      const float valphav1 = w[3];
+      w += 2 * 2;
 
       const float vtl0 = *i0;
       const float vtr0 = *i1;
@@ -88,8 +85,9 @@
       const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
       i += 4;
 
-      const float valphah = *wh++;
-      const float valphav = *wv++;
+      const float valphah = w[0];
+      const float valphav = w[1];
+      w += 2;
 
       const float vtl = *i0;
       const float vtr = *i1;
diff --git a/src/f32-ibilinear-chw/gen/scalar-p4.c b/src/f32-ibilinear-chw/gen/scalar-p4.c
index a43cca6..d198c4f 100644
--- a/src/f32-ibilinear-chw/gen/scalar-p4.c
+++ b/src/f32-ibilinear-chw/gen/scalar-p4.c
@@ -17,8 +17,7 @@
     size_t channels,
     const float**restrict input,
     size_t input_offset,
-    const float*restrict horizontal_weights,
-    const float*restrict vertical_weights,
+    const float*restrict weights,
     float*restrict output,
     size_t input_increment)
 {
@@ -29,8 +28,7 @@
   size_t c = channels;
   do {
     const float** i = input;
-    const float* wh = horizontal_weights;
-    const float* wv = vertical_weights;
+    const float* w = weights;
 
     size_t p = output_pixels;
     for (; p >= 4; p -= 4) {
@@ -52,16 +50,15 @@
       const float* i15 = (const float*) ((uintptr_t) i[15] + input_offset);
       i += 4 * 4;
 
-      const float valphah0 = wh[0];
-      const float valphav0 = wv[0];
-      const float valphah1 = wh[1];
-      const float valphav1 = wv[1];
-      const float valphah2 = wh[2];
-      const float valphav2 = wv[2];
-      const float valphah3 = wh[3];
-      const float valphav3 = wv[3];
-      wh += 4;
-      wv += 4;
+      const float valphah0 = w[0];
+      const float valphav0 = w[1];
+      const float valphah1 = w[2];
+      const float valphav1 = w[3];
+      const float valphah2 = w[4];
+      const float valphav2 = w[5];
+      const float valphah3 = w[6];
+      const float valphav3 = w[7];
+      w += 4 * 2;
 
       const float vtl0 = *i0;
       const float vtr0 = *i1;
@@ -122,8 +119,9 @@
       const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
       i += 4;
 
-      const float valphah = *wh++;
-      const float valphav = *wv++;
+      const float valphah = w[0];
+      const float valphav = w[1];
+      w += 2;
 
       const float vtl = *i0;
       const float vtr = *i1;
diff --git a/src/f32-ibilinear-chw/scalar.c.in b/src/f32-ibilinear-chw/scalar.c.in
index 105190d..9172489 100644
--- a/src/f32-ibilinear-chw/scalar.c.in
+++ b/src/f32-ibilinear-chw/scalar.c.in
@@ -3,6 +3,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+$assert CHANNEL_TILE == 1
 $assert PIXEL_TILE >= 1
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 #include <assert.h>
@@ -10,13 +11,12 @@
 #include <xnnpack/ibilinear.h>
 
 
-void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}(
+void xnn_f32_ibilinear_chw_ukernel__scalar_p${PIXEL_TILE}${"" if CHANNEL_TILE == 1 else "x%d" % CHANNEL_TILE}(
     size_t output_pixels,
     size_t channels,
     const float**restrict input,
     size_t input_offset,
-    const float*restrict horizontal_weights,
-    const float*restrict vertical_weights,
+    const float*restrict weights,
     float*restrict output,
     size_t input_increment)
 {
@@ -27,8 +27,7 @@
   size_t c = channels;
   do {
     const float** i = input;
-    const float* wh = horizontal_weights;
-    const float* wv = vertical_weights;
+    const float* w = weights;
 
     size_t p = output_pixels;
     $if PIXEL_TILE > 1:
@@ -38,10 +37,9 @@
         i += ${PIXEL_TILE} * 4;
 
         $for P in range(PIXEL_TILE):
-           const float valphah${ABC[P]} = wh[${P}];
-           const float valphav${ABC[P]} = wv[${P}];
-        wh += ${PIXEL_TILE};
-        wv += ${PIXEL_TILE};
+           const float valphah${ABC[P]} = w[${P * 2}];
+           const float valphav${ABC[P]} = w[${P * 2 + 1}];
+        w += ${PIXEL_TILE} * 2;
 
         $for P in range(PIXEL_TILE):
           const float vtl${ABC[P]} = *i${P * 4};
@@ -75,8 +73,9 @@
         const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
         i += 4;
 
-        const float valphah = *wh++;
-        const float valphav = *wv++;
+        const float valphah = w[0];
+        const float valphav = w[1];
+        w += 2;
 
         const float vtl = *i0;
         const float vtr = *i1;
@@ -103,8 +102,9 @@
         const float* i3 = (const float*) ((uintptr_t) i[3] + input_offset);
         i += 4;
 
-        const float valphah = *wh++;
-        const float valphav = *wv++;
+        const float valphah = w[0];
+        const float valphav = w[1];
+        w += 2;
 
         const float vtl = *i0;
         const float vtr = *i1;
diff --git a/src/xnnpack/ibilinear.h b/src/xnnpack/ibilinear.h
index f08daf4..5c4252d 100644
--- a/src/xnnpack/ibilinear.h
+++ b/src/xnnpack/ibilinear.h
@@ -27,14 +27,13 @@
       size_t output_increment);
 
 #define DECLARE_F32_IBILINEAR_CHW_UKERNEL_FUNCTION(fn_name) \
-XNN_INTERNAL void fn_name(                                  \
-    size_t output_pixels,                                   \
-    size_t channels,                                        \
-    const float** input,                                    \
-    size_t input_offset,                                    \
-    const float* horizontal_weights,                        \
-    const float* vertical_weights,                          \
-    float* output,                                          \
+XNN_INTERNAL void fn_name(                            \
+    size_t output_pixels,                             \
+    size_t channels,                                  \
+    const float** input,                              \
+    size_t input_offset,                              \
+    const float* weights,                             \
+    float* output,                                    \
     size_t input_increment);
 
 DECLARE_F32_IBILINEAR_UKERNEL_FUNCTION(xnn_f32_ibilinear_ukernel__scalar_c1)
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 11022aa..f504fb9 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1106,8 +1106,7 @@
     size_t channels,
     const float** input,
     size_t input_offset,
-    const float* horizontal_weights,
-    const float* vertical_weights,
+    const float* weights,
     float* output,
     size_t input_increment);
 
@@ -1125,8 +1124,7 @@
     size_t channels,
     const void** input,
     size_t input_offset,
-    const float* horizontal_weights,
-    const float* vertical_weights,
+    const void* weights,
     void* output,
     size_t input_increment);
 
diff --git a/test/f32-ibilinear-chw.cc b/test/f32-ibilinear-chw.cc
index a6eecd4..a41440c 100644
--- a/test/f32-ibilinear-chw.cc
+++ b/test/f32-ibilinear-chw.cc
@@ -21,7 +21,7 @@
   IBilinearMicrokernelTester()
     .pixels(1)
     .channels(1)
-    .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+    .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
 }
 
 TEST(F32_IBILINEAR_CHW__SCALAR_P1, pixels_gt_1) {
@@ -29,7 +29,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
   }
 }
 
@@ -39,7 +39,7 @@
       IBilinearMicrokernelTester()
         .pixels(pixels)
         .channels(channels)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
     }
   }
 }
@@ -51,7 +51,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_offset(7)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
     }
   }
 }
@@ -63,7 +63,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_stride(23)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p1);
     }
   }
 }
@@ -73,7 +73,7 @@
   IBilinearMicrokernelTester()
     .pixels(2)
     .channels(1)
-    .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+    .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
 }
 
 TEST(F32_IBILINEAR_CHW__SCALAR_P2, pixels_div_2) {
@@ -81,7 +81,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
   }
 }
 
@@ -90,7 +90,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
   }
 }
 
@@ -99,7 +99,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
   }
 }
 
@@ -109,7 +109,7 @@
       IBilinearMicrokernelTester()
         .pixels(pixels)
         .channels(channels)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
     }
   }
 }
@@ -121,7 +121,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_offset(7)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
     }
   }
 }
@@ -133,7 +133,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_stride(43)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p2);
     }
   }
 }
@@ -143,7 +143,7 @@
   IBilinearMicrokernelTester()
     .pixels(4)
     .channels(1)
-    .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+    .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
 }
 
 TEST(F32_IBILINEAR_CHW__SCALAR_P4, pixels_div_4) {
@@ -151,7 +151,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
   }
 }
 
@@ -160,7 +160,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
   }
 }
 
@@ -169,7 +169,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(1)
-      .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+      .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
   }
 }
 
@@ -179,7 +179,7 @@
       IBilinearMicrokernelTester()
         .pixels(pixels)
         .channels(channels)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
     }
   }
 }
@@ -191,7 +191,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_offset(7)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
     }
   }
 }
@@ -203,7 +203,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_stride(83)
-        .Test(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
+        .TestCHW(xnn_f32_ibilinear_chw_ukernel__scalar_p4);
     }
   }
 }
diff --git a/test/ibilinear-microkernel-tester.h b/test/ibilinear-microkernel-tester.h
index 13e127a..6e02086 100644
--- a/test/ibilinear-microkernel-tester.h
+++ b/test/ibilinear-microkernel-tester.h
@@ -145,22 +145,20 @@
     }
   }
 
-  void Test(xnn_f32_ibilinear_chw_ukernel_function ibilinear) const {
+  void TestCHW(xnn_f32_ibilinear_chw_ukernel_function ibilinear) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
     auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
 
     std::vector<const float*> indirection(pixels() * 4);
     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + (channels() - 1) * input_stride() + 4 * pixels());
-    std::vector<float, AlignedAllocator<float, 64>> horizontal_weights(pixels());
-    std::vector<float, AlignedAllocator<float, 64>> vertical_weights(pixels());
+    std::vector<float, AlignedAllocator<float, 64>> packed_weights(pixels() * 2);
     std::vector<float> output(pixels() * channels());
     std::vector<float> output_ref(pixels() * channels());
 
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
       std::generate(input.begin(), input.end(), std::ref(f32rng));
-      std::generate(horizontal_weights.begin(), horizontal_weights.end(), std::ref(f32rng));
-      std::generate(vertical_weights.begin(), vertical_weights.end(), std::ref(f32rng));
+      std::generate(packed_weights.begin(), packed_weights.end(), std::ref(f32rng));
       std::fill(output.begin(), output.end(), nanf(""));
 
       for (size_t i = 0; i < indirection.size(); i++) {
@@ -171,8 +169,8 @@
       // Compute reference results.
       for (size_t i = 0; i < pixels(); i++) {
         for (size_t c = 0; c < channels(); c++) {
-          const float alpha_h = horizontal_weights[i];
-          const float alpha_v = vertical_weights[i];
+          const float alpha_h = packed_weights[i * 2 + 0];
+          const float alpha_v = packed_weights[i * 2 + 1];
           // `c * pixels() + i` because the output is NCHW.
           output_ref[c * pixels() + i] =
             // `c * indirection.size()` because the input is NCHW.
@@ -187,8 +185,7 @@
       ibilinear(
         pixels(), channels(),
         indirection.data(), input_offset() * sizeof(float),
-        horizontal_weights.data(), vertical_weights.data(),
-        output.data(), input_stride() * sizeof(float));
+        packed_weights.data(), output.data(), input_stride() * sizeof(float));
 
       // Verify results.
       for (size_t c = 0; c < channels(); c++) {
diff --git a/tools/generate-ibilinear-chw-test.py b/tools/generate-ibilinear-chw-test.py
index c71476d..f1a1efb 100755
--- a/tools/generate-ibilinear-chw-test.py
+++ b/tools/generate-ibilinear-chw-test.py
@@ -43,7 +43,7 @@
   IBilinearMicrokernelTester()
     .pixels(${PIXEL_TILE})
     .channels(${CHANNEL_TILE})
-    .Test(${TEST_FUNC});
+    .TestCHW(${TEST_FUNC});
 }
 
 $if PIXEL_TILE > 1:
@@ -54,7 +54,7 @@
       IBilinearMicrokernelTester()
         .pixels(pixels)
         .channels(${CHANNEL_TILE})
-        .Test(${TEST_FUNC});
+        .TestCHW(${TEST_FUNC});
     }
   }
 
@@ -65,7 +65,7 @@
       IBilinearMicrokernelTester()
         .pixels(pixels)
         .channels(${CHANNEL_TILE})
-        .Test(${TEST_FUNC});
+        .TestCHW(${TEST_FUNC});
     }
   }
 
@@ -76,7 +76,7 @@
     IBilinearMicrokernelTester()
       .pixels(pixels)
       .channels(${CHANNEL_TILE})
-      .Test(${TEST_FUNC});
+      .TestCHW(${TEST_FUNC});
   }
 }
 
@@ -89,7 +89,7 @@
         IBilinearMicrokernelTester()
           .pixels(pixels)
           .channels(channels)
-          .Test(${TEST_FUNC});
+          .TestCHW(${TEST_FUNC});
       }
     }
   }
@@ -102,7 +102,7 @@
         IBilinearMicrokernelTester()
           .pixels(pixels)
           .channels(channels)
-          .Test(${TEST_FUNC});
+          .TestCHW(${TEST_FUNC});
       }
     }
   }
@@ -115,7 +115,7 @@
       IBilinearMicrokernelTester()
         .pixels(pixels)
         .channels(channels)
-        .Test(${TEST_FUNC});
+        .TestCHW(${TEST_FUNC});
     }
   }
 }
@@ -129,7 +129,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_offset(${next_prime(CHANNEL_TILE * 5 + 1)})
-        .Test(${TEST_FUNC});
+        .TestCHW(${TEST_FUNC});
     }
   }
 }
@@ -143,7 +143,7 @@
         .pixels(pixels)
         .channels(channels)
         .input_stride(${next_prime(4 * (PIXEL_TILE * 5) + 1)})
-        .Test(${TEST_FUNC});
+        .TestCHW(${TEST_FUNC});
     }
   }
 }