Add missing generated unit tests to BUILD and CMakeLists.txt.

Also bump the number of qs8-igemm-minmax-rndnu test files to 3, as compile time was still too slow.

PiperOrigin-RevId: 421052556
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index e8a4c93..5a91090 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -23,8 +23,8 @@
 #include "gemm-microkernel-tester.h"
 
 
-#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -34,10 +34,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -48,10 +48,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -64,12 +64,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -81,11 +81,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -97,11 +97,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -112,11 +112,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -130,13 +130,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -147,11 +147,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -165,13 +165,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -182,11 +182,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -200,13 +200,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -218,12 +218,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -236,12 +236,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -255,13 +255,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -273,12 +273,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -291,12 +291,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -310,13 +310,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -328,11 +328,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -347,13 +347,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -366,12 +366,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -384,12 +384,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -404,13 +404,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -423,11 +423,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -442,12 +442,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -458,10 +458,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -472,10 +472,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -486,45 +486,513 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
-#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8) {
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(4)
       .nr(8)
-      .kr(2)
+      .kr(1)
       .sr(1)
-      .m(2)
+      .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(4)
       .nr(8)
-      .kr(2)
+      .kr(1)
       .sr(1)
-      .m(2)
+      .m(4)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {  // Sweeps m over 1..4 while n and k stay at 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)  // Row count under test; never exceeds mr(4).
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {  // Sweeps n over 1..8 while m stays at 4 and k at 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)  // Column count under test; never exceeds nr(8).
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8) {  // Runs the full 4x8 tile for every k in 1..7.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {  // Every (m, n) in 1..4 x 1..8 for every k in 1..7.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8) {  // Runs the full 4x8 tile for every k in 9..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {  // Every (m, n) in 1..4 x 1..8 for every k in 9..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8) {  // Runs the full 4x8 tile for k = 16, 24, ..., 80.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8_subtile) {  // Every (m, n) in 1..4 x 1..8 for k = 16, 24, ..., 80.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8) {  // Sweeps n over 9..15 (just past nr = 8) with m fixed at 4.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)  // Pass the loop variable: a fixed n(8) repeats one case and never exceeds nr.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {  // Sweeps n over 9..15 with cn_stride(11) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)  // Pass the loop variable: a fixed n(8) repeats one case and never exceeds nr.
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {  // n in 9..15 combined with every m in 1..4.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8) {  // Runs n = 16 and n = 24 (multiples of nr = 8) with m fixed at 4.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)  // Pass the loop variable: a fixed n(8) repeats one case and never reaches 16 or 24.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {  // Runs n = 16 and n = 24 with cn_stride(11) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_subtile) {  // n = 16 and n = 24 combined with every m in 1..4.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, small_kernel) {  // Full 4x8 tile with ks(3) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, small_kernel_subtile) {  // Every (m, n) in 1..4 x 1..8 with ks(3) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_small_kernel) {  // Sweeps n over 9..15 with ks(3) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)  // Pass the loop variable: a fixed n(8) repeats one case and never exceeds nr.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_small_kernel) {  // Runs n = 16 and n = 24 with ks(3) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)  // Pass the loop variable: a fixed n(8) repeats one case and never reaches 16 or 24.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm_subtile) {  // Every (m, n) in 1..4 x 1..8 with cm_stride(11) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, a_offset) {  // Full 4x8 tile with ks(3) and a_offset(163) set on the tester.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, zero) {  // Sweeps zero_index over 0..3 with ks(3) and a_offset(163) set.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {  // One pass per value below mr(4).
+      for (size_t k = 1; k <= 40; k += 9) {  // k takes the values 1, 10, 19, 28, 37.
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmin) {  // Single full-tile run (m = 4, n = 8, k = 8) with qmin(128) set.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmax) {  // Single full-tile run (m = 4, n = 8, k = 8) with qmax(128) set.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm) {  // Single full-tile run (m = 4, n = 8, k = 8) with cm_stride(11) set.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8) {  // Single full-tile run of the 1x8c2 kernel (m = 1, n = 8, k = 8).
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, strided_cn) {  // Single full-tile run (m = 1, n = 8, k = 8) with cn_stride(11) set.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
@@ -532,16 +1000,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
@@ -549,48 +1017,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -598,34 +1066,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -633,34 +1101,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -668,54 +1136,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8) {  // sweeps n = 9..15 (above nr = 8) for several k
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was unused, so n > 8 was never exercised
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8_strided_cn) {  // sweeps n = 9..15 with cn_stride = 11
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was unused, so n > 8 was never exercised
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -723,54 +1191,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8) {  // sweeps n = 16, 24 (multiples of nr = 8) for several k
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was unused, so multiples of 8 above 8 were never exercised
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -778,35 +1246,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -815,55 +1283,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8_small_kernel) {  // sweeps n = 9..15 with ks = 3
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was unused, so n > 8 was never exercised
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8_small_kernel) {  // sweeps n = 16, 24 with ks = 3
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was unused, so multiples of 8 above 8 were never exercised
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -872,1961 +1340,1025 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(2)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16) {  // sweeps n = 17..31 (above nr = 16) for several k
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // was .n(16): the loop variable was unused, so n > 16 was never exercised
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16_strided_cn) {  // sweeps n = 17..31 with cn_stride = 19
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // was .n(16): the loop variable was unused, so n > 16 was never exercised
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16) {  // sweeps n = 32, 48 (multiples of nr = 16) for several k
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // was .n(16): the loop variable was unused, so multiples of 16 above 16 were never exercised
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16_small_kernel) {  // sweeps n = 17..31 with ks = 3
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // was .n(16): the loop variable was unused, so n > 16 was never exercised
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16_small_kernel) {  // sweeps n = 32, 48 with ks = 3
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // was .n(16): the loop variable was unused, so multiples of 16 above 16 were never exercised
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16) {  // baseline: full 1x8 tile (m == mr, n == nr) at k = 16
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, strided_cn) {  // same shape as k_eq_16, plus an explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {  // every (m, n) with m <= mr and n <= nr at k = 16
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+      for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {  // varies m only; n stays at nr(8)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {  // varies n only; m stays at mr(1)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_lt_16) {  // full 1x8 tile for every k below 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {  // every (m, n) subtile for every k below 16
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {  // k = 1..15
+      for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_gt_16) {  // full 1x8 tile for k strictly between 16 and 32
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {  // every (m, n) subtile for k strictly between 16 and 32
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {  // k = 17..31
+      for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_div_16) {  // full 1x8 tile for multiples of 16 from 32 up
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {  // every (m, n) subtile for multiples of 16 from 32 up
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {  // k = 32, 48, ..., 160
+      for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8) {  // n strictly between nr(8) and 16, full m
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was ignored, so n > 8 never ran. NOTE(review): generated file - mirror in the generator.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {  // n strictly between nr(8) and 16, with cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was ignored, so n > 8 never ran. NOTE(review): generated file - mirror in the generator.
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {  // n strictly between nr(8) and 16, for every m <= mr
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8) {  // n at multiples of nr(8) above nr, full m
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was ignored, so n = 16/24 never ran. NOTE(review): generated file - mirror in the generator.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {  // n at multiples of nr(8) above nr, with cn_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {  // n at multiples of nr(8) above nr, for every m <= mr
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, small_kernel) {  // full 1x8 tile with ks(3) over a sparse k sweep
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, small_kernel_subtile) {  // every (m, n) subtile with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8_small_kernel) {  // n strictly between nr(8) and 16, with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was ignored, so n > 8 never ran. NOTE(review): generated file - mirror in the generator.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8_small_kernel) {  // n at multiples of nr(8) above nr, with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)  // was .n(8): the loop variable was ignored, so n = 16/24 never ran. NOTE(review): generated file - mirror in the generator.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {  // every (m, n) subtile with cm_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      for (uint32_t m = 1; m <= 1; m++) {  // runs once (m = 1) because the bound equals mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // 11 is larger than nr(8), the largest n used here
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, a_offset) {  // full 1x8 tile with ks(3) and a_offset(83)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, zero) {  // full 1x8 tile, run once per zero_index value
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t mz = 0; mz < 1; mz++) {  // runs once (mz = 0) because the bound equals mr(1)
+      for (size_t k = 1; k <= 80; k += 17) {  // k = 1, 18, 35, 52, 69
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -3300,475 +2832,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3777,11 +2841,11 @@
       .sr(1)
       .m(1)
       .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -3790,12 +2854,12 @@
       .sr(1)
       .m(1)
       .n(16)
-      .k(8)
+      .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -3806,14 +2870,14 @@
           .sr(1)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -3823,13 +2887,13 @@
         .sr(1)
         .m(m)
         .n(16)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -3839,15 +2903,15 @@
         .sr(1)
         .m(1)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
@@ -3856,13 +2920,13 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
@@ -3874,15 +2938,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
@@ -3891,13 +2955,13 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
@@ -3909,15 +2973,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
@@ -3926,13 +2990,13 @@
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
@@ -3944,16 +3008,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
@@ -3962,15 +3026,15 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
@@ -3980,15 +3044,15 @@
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
@@ -3999,16 +3063,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
@@ -4017,15 +3081,15 @@
           .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
@@ -4035,15 +3099,15 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
@@ -4054,15 +3118,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
@@ -4072,13 +3136,13 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
@@ -4091,16 +3155,16 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
@@ -4110,15 +3174,15 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
@@ -4128,14 +3192,14 @@
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
@@ -4148,15 +3212,15 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
@@ -4166,539 +3230,71 @@
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(2)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(2)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MULL_LD2R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(2)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
-      .k(8)
+      .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -5172,1410 +3768,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -7512,474 +4704,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -8448,6 +5172,474 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8) {  // Baseline: one full 2x8 tile (m = mr, n = nr) at k = 8.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, strided_cn) {  // Full 2x8 tile at k = 8 with an explicit cn_stride.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)  // Larger than nr = 8.
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8_subtile) {  // Every m x n sub-tile (m = 1..2, n = 1..8) at k = 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)  // One iteration per (m, n) combination.
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8_subtile_m) {  // Sweeps only the row count m = 1..2; n = 8 and k = 8 stay fixed.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8_subtile_n) {  // Sweeps only the column count n = 1..8; m = 2 and k = 8 stay fixed.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_lt_8) {  // Full 2x8 tile for every k below 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {  // k = 1..7
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_lt_8_subtile) {  // Every m x n sub-tile for every k below 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {  // k = 1..7
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // One iteration per (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_gt_8) {  // Full 2x8 tile for every k strictly between 8 and 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {  // k = 9..15
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_gt_8_subtile) {  // Every m x n sub-tile for every k strictly between 8 and 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {  // k = 9..15
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // One iteration per (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_div_8) {  // Full 2x8 tile for k at multiples of 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {  // k = 16, 24, ..., 80
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_div_8_subtile) {  // Every m x n sub-tile for k at multiples of 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {  // k = 16, 24, ..., 80
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // One iteration per (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8) {  // Column counts n = 9..15 (above nr = 8) with m = 2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // Pass the loop's n; a fixed n(8) only repeated the full-tile case on every iteration.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8_strided_cn) {  // Column counts n = 9..15 (above nr = 8) with cn_stride = 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // Pass the loop's n (as n_div_8_strided_cn does); a fixed n(8) ignored the sweep.
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8_subtile) {  // Column counts n = 9..15 (above nr = 8) for each row count m = 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // One iteration per (n, k, m) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8) {  // Column counts n = 16 and 24 (multiples of nr = 8) with m = 2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // Pass the loop's n; a fixed n(8) only repeated the single-tile case on every iteration.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8_strided_cn) {  // Column counts n = 16 and 24 (multiples of nr = 8) with cn_stride = 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8_subtile) {  // Column counts n = 16 and 24 for each row count m = 1..2.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // One iteration per (n, k, m) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, small_kernel) {  // Full 2x8 tile with ks = 3 over a sparse sweep of k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, small_kernel_subtile) {  // Every m x n sub-tile with ks = 3 over a sparse sweep of k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)  // One iteration per (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8_small_kernel) {  // Column counts n = 9..15 (above nr = 8) with ks = 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // Pass the loop's n; a fixed n(8) made this a repeat of small_kernel on every iteration.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8_small_kernel) {  // Column counts n = 16 and 24 (multiples of nr = 8) with ks = 3.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // Pass the loop's n; a fixed n(8) made this a repeat of small_kernel on every iteration.
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, strided_cm_subtile) {  // Every m x n sub-tile with cm_stride = 11 over a sparse sweep of k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // Larger than nr = 8.
+            .iterations(1)  // One iteration per (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, a_offset) {  // Full 2x8 tile with ks = 3 and a_offset = 83 over a sparse sweep of k.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, zero) {  // Same setup as a_offset (ks = 3, a_offset = 83), run once per zero_index value.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {  // mz = 0..1, i.e. one value per row of the mr = 2 tile.
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, qmin) {  // Full 2x8 tile at k = 8 with qmin set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, qmax) {  // Full 2x8 tile at k = 8 with qmax set to 128.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, strided_cm) {  // Full 2x8 tile at k = 8 with an explicit cm_stride.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)  // Larger than nr = 8.
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MULL_LD4R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -10320,1878 +7512,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD4R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD4R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -12660,6 +7980,474 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8) {  // Single case: m=1, n=8 (equal to the mr/nr set below) with k=8.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(2)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cn) {  // Same m=1, n=8, k=8 case, but with an explicit cn_stride.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(2)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)  // 11 is larger than nr (8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile) {  // k=8 for every (m, n) pair with m in [1, 1] and n in [1, 8].
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_m) {  // k=8, n=8, sweeping m only.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_n) {  // k=8, m=1, sweeping n over [1, 8].
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8) {  // m=1, n=8 with k swept over 1..7.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8_subtile) {  // k in 1..7 crossed with m in [1, 1] and n in [1, 8].
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8) {  // m=1, n=8 with k swept over 9..15.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8_subtile) {  // k in 9..15 crossed with m in [1, 1] and n in [1, 8].
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8) {  // m=1, n=8 with k = 16, 24, ..., 80 (multiples of 8).
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8_subtile) {  // k = 16, 24, ..., 80 crossed with m in [1, 1] and n in [1, 8].
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8) {  // m=1 with n swept over 9..15 (above nr=8) and k in {1, 10, 19, 28, 37}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed n(8) would repeat the full-tile case and never cover n > nr
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_strided_cn) {  // m=1, n swept over 9..15, k in {1, 10, 19, 28, 37}, with cn_stride 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed n(8) would repeat the full-tile case and never cover n > nr
+          .k(k)
+          .cn_stride(11)  // same stride the n_div_8_strided_cn test uses together with .n(n)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_subtile) {  // n in 9..15, k in {1, 10, 19, 28, 37}, m in [1, 1].
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8) {  // m=1 with n in {16, 24} (multiples of nr=8) and k in {1, 10, 19, 28, 37}.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(n)  // pass the loop value; a fixed n(8) would repeat the single-tile case and never cover larger n
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_strided_cn) {  // m=1, n in {16, 24}, k in {1, 10, 19, 28, 37}, with cn_stride 11.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr (8)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_subtile) {  // n in {16, 24}, k in {1, 10, 19, 28, 37}, m in [1, 1].
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {  // upper bound matches mr(1), so only m=1 runs
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, small_kernel) {  // k = 1, 10, 19, 28, 37 at m = 1, n = 8, with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)  // NOTE(review): ks presumably is the indirection kernel size - confirm in GemmMicrokernelTester
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, small_kernel_subtile) {  // k = 1, 10, 19, 28, 37 x n = 1..8, with ks(3) and iterations(1)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_small_kernel) {  // n = 9..15 (above nr = 8) x k = 1, 10, 19, 28, 37, with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(8) would never exercise n > 8 as the test name promises
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_small_kernel) {  // n = 16, 24 (multiples of nr = 8) x k = 1, 10, 19, 28, 37, with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(8) would re-run the identical n = 8 case on every pass
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm_subtile) {  // k = 1, 10, 19, 28, 37 x n = 1..8, with cm_stride(11) and iterations(1)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(2)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // explicit cm_stride of 11, larger than the widest n (8) in this sweep
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, a_offset) {  // k = 1, 10, 19, 28, 37 at m = 1, n = 8, with ks(3) and a_offset(43)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(2)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)  // NOTE(review): 43 presumably is derived from mr and the largest k by the test generator - confirm
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, zero) {  // k = 1, 10, 19, 28, 37 at m = 1, n = 8, with ks(3), a_offset(43) and zero_index(mz)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {  // single pass: mz = 0 only (bound equals mr = 1)
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(2)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmin) {  // single case: m = 1, n = 8, k = 8 with qmin(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(2)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)  // NOTE(review): presumably the lower output clamp bound - confirm how the tester maps 128 for signed QS8
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmax) {  // single case: m = 1, n = 8, k = 8 with qmax(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(2)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp bound - confirm how the tester maps 128 for signed QS8
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm) {  // single case: m = 1, n = 8, k = 8 with cm_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(2)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)  // explicit cm_stride of 11, larger than n = 8
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -13128,322 +8916,322 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8) {  // single case: m = 1, n = 16, k = 8
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
+      .mr(1)
+      .nr(16)
       .kr(4)
       .sr(2)
-      .m(4)
-      .n(8)
+      .m(1)
+      .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cn) {  // single case: m = 1, n = 16, k = 8 with cn_stride(19)
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
+      .mr(1)
+      .nr(16)
       .kr(4)
       .sr(2)
-      .m(4)
-      .n(8)
+      .m(1)
+      .n(16)
       .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(19)  // explicit cn_stride of 19, larger than nr = 16
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile) {  // k = 8 for every n = 1..16, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
+    for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+      for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
           .m(m)
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_m) {  // k = 8, n = 16, with m taken from the loop and iterations(1)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
         .m(m)
-        .n(8)
+        .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_n) {  // k = 8, m = 1, n = 1..16, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
+    for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
-        .m(4)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8) {  // k = 1..7 at m = 1, n = 16
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
-        .m(4)
-        .n(8)
+        .m(1)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8_subtile) {  // k = 1..7 x n = 1..16, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8) {  // k = 9..15 at m = 1, n = 16
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
-        .m(4)
-        .n(8)
+        .m(1)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8_subtile) {  // k = 9..15 x n = 1..16, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8) {  // k = 16, 24, ..., 80 (multiples of 8) at m = 1, n = 16
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
-        .m(4)
-        .n(8)
+        .m(1)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8_subtile) {  // k = 16, 24, ..., 80 x n = 1..16, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16) {  // n = 17..31 (above nr = 16) x k = 1, 10, 19, 28, 37, at m = 1
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
-          .n(8)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(16) would never exercise n > 16 as the test name promises
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_strided_cn) {  // n = 17..31 (above nr = 16) x k = 1, 10, 19, 28, 37, with cn_stride(19)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
-          .n(8)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(16) would never exercise n > 16 as the test name promises
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_subtile) {  // n = 17..31 (above nr = 16) x k = 1, 10, 19, 28, 37, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16) {  // n = 32, 48 (multiples of nr = 16) x k = 1, 10, 19, 28, 37, at m = 1
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
-          .n(8)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(16) would re-run the identical n = 16 case on every pass
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_strided_cn) {  // n = 32, 48 x k = 1, 10, 19, 28, 37, with cn_stride(19)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
+          .m(1)
           .n(n)
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)  // explicit cn_stride of 19, larger than nr = 16
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_subtile) {  // n = 32, 48 x k = 1, 10, 19, 28, 37, with iterations(1)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, small_kernel) {  // k = 1, 10, 19, 28, 37 at m = 1, n = 16, with ks(3)
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
-        .m(4)
-        .n(8)
+        .m(1)
+        .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
@@ -13451,146 +9239,146 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_small_kernel) {  // n = 17..31 (above nr = 16) x k = 1, 10, 19, 28, 37, with ks(3)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
-          .n(8)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(16) would never exercise n > 16 as the test name promises
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_small_kernel) {  // n = 32, 48 (multiples of nr = 16) x k = 1, 10, 19, 28, 37, with ks(3)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
-          .n(8)
+          .m(1)
+          .n(n)  // use the loop's n; a fixed .n(16) would re-run the identical n = 16 case on every pass
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm_subtile) {  // k = 1, 10, 19, 28, 37 x n = 1..16, with cm_stride(19) and iterations(1)
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: m = 1 only, matching mr(1)
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
+            .mr(1)
+            .nr(16)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(11)
+            .cm_stride(19)  // explicit cm_stride of 19, larger than the widest n (16) in this sweep
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, a_offset) {  // k = 1, 10, 19, 28, 37 at m = 1, n = 16, with ks(3) and a_offset(43)
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
+        .mr(1)
+        .nr(16)
         .kr(4)
         .sr(2)
-        .m(4)
-        .n(8)
+        .m(1)
+        .n(16)
         .k(k)
         .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)  // NOTE(review): offset drops from 163 to 43 along with mr 4 -> 1; presumably derived from mr by the generator - confirm
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, zero) {  // k = 1, 10, 19, 28, 37 at m = 1, n = 16, with ks(3), a_offset(43) and zero_index(mz)
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {  // single pass: mz = 0 only (bound equals mr = 1)
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
+          .mr(1)
+          .nr(16)
           .kr(4)
           .sr(2)
-          .m(4)
-          .n(8)
+          .m(1)
+          .n(16)
           .k(k)
           .ks(3)
-          .a_offset(163)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmin) {  // single case: m = 1, n = 16, k = 8 with qmin(128)
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
+      .mr(1)
+      .nr(16)
       .kr(4)
       .sr(2)
-      .m(4)
-      .n(8)
+      .m(1)
+      .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmax) {  // single case: m = 1, n = 16, k = 8 with qmax(128)
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
+      .mr(1)
+      .nr(16)
       .kr(4)
       .sr(2)
-      .m(4)
-      .n(8)
+      .m(1)
+      .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm) {  // single case: m = 1, n = 16, k = 8 with cm_stride(19)
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
+      .mr(1)
+      .nr(16)
       .kr(4)
       .sr(2)
-      .m(4)
-      .n(8)
+      .m(1)
+      .n(16)
       .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cm_stride(19)  // explicit cm_stride of 19, larger than n = 16
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -14532,474 +10320,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(3)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -15468,39 +10788,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(2)
+      .m(1)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(2)
+      .m(1)
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
@@ -15508,16 +10828,16 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
@@ -15525,48 +10845,48 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(2)
+        .m(1)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15574,34 +10894,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15609,34 +10929,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15644,54 +10964,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15699,54 +11019,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15754,35 +11074,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15791,55 +11111,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -15848,127 +11168,127 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(163)
+          .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(2)
+      .m(1)
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(2)
+      .m(1)
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4S2__NEON_MLAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(2)
+      .m(1)
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(3)
+      .m(4)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(3)
+      .m(4)
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
@@ -15976,16 +11296,16 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
@@ -15993,48 +11313,48 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(3)
+        .m(4)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(3)
+        .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16042,34 +11362,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(3)
+        .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16077,34 +11397,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(3)
+        .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16112,54 +11432,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16167,54 +11487,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16222,35 +11542,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(3)
+        .m(4)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16259,55 +11579,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(2)
@@ -16316,89 +11636,557 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(2)
-        .m(3)
+        .m(4)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
+    for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(2)
-          .m(3)
+          .m(4)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(251)
+          .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(3)
+      .m(4)
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(3)
+      .m(4)
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4S2__NEON_MLAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(2)
-      .m(3)
+      .m(4)
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8) {  // Single run: m = 1, n = 8, k = 8.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)  // mr/nr/kr/sr = 1/8/2/4, matching the "1x8c2s4" in the kernel name.
+      .nr(8)
+      .kr(2)
+      .sr(4)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cn) {  // Same m/n/k as k_eq_8, plus an explicit cn_stride.
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(4)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)  // 11 is larger than nr = 8; presumably a non-contiguous output stride -- confirm in GemmMicrokernelTester.
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile) {  // k = 8 over every (m, n) with m <= 1 and n <= 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // Runs once (m = 1) because mr is 1.
+      for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)  // Sets the tester's iteration count to 1 for each (m, n) pair.
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_m) {  // k = 8, n = 8, varying only m.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {  // Runs once (m = 1) because mr is 1.
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)  // Sets the tester's iteration count to 1.
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_n) {  // k = 8, m = 1, varying only n.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)  // Sets the tester's iteration count to 1 for each n.
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8) {  // m = 1, n = 8 with every k below 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {  // k = 1..7
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8_subtile) {  // Every k below 8 crossed with every (m, n) up to 1x8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {  // k = 1..7
+      for (uint32_t m = 1; m <= 1; m++) {  // Runs once (m = 1) because mr is 1.
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // Sets the tester's iteration count to 1 for each (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8) {  // m = 1, n = 8 with k strictly between 8 and 16.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {  // k = 9..15
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8_subtile) {  // k in 9..15 crossed with every (m, n) up to 1x8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {  // k = 9..15
+      for (uint32_t m = 1; m <= 1; m++) {  // Runs once (m = 1) because mr is 1.
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // Sets the tester's iteration count to 1 for each (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8) {  // m = 1, n = 8 with k a multiple of 8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {  // k = 16, 24, ..., 80
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8_subtile) {  // Multiples of 8 for k crossed with every (m, n) up to 1x8.
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {  // k = 16, 24, ..., 80
+      for (uint32_t m = 1; m <= 1; m++) {  // Runs once (m = 1) because mr is 1.
+        for (uint32_t n = 1; n <= 8; n++) {  // n = 1..8
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // Sets the tester's iteration count to 1 for each (k, m, n) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8) {  // m = 1 with n above nr = 8 but below 2 * nr.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(n)  // Was hard-coded to 8, which left the loop variable unused and only re-ran the n == nr case.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_strided_cn) {  // n in 9..15 with an explicit cn_stride.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(n)  // Was hard-coded to 8, leaving the loop variable unused; n_div_8_strided_cn already passes n.
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr = 8.
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_subtile) {  // n in 9..15 crossed with sampled k and every m <= 1.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {  // n = 9..15
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        for (uint32_t m = 1; m <= 1; m++) {  // Runs once (m = 1) because mr is 1.
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // Sets the tester's iteration count to 1 for each (n, k, m) combination.
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8) {  // m = 1 with n a multiple of nr = 8 larger than nr.
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {  // n = 16, 24
+      for (size_t k = 1; k <= 40; k += 9) {  // k = 1, 10, 19, 28, 37
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(n)  // Was hard-coded to 8, which left the loop variable unused and only re-ran the n == nr case.
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_strided_cn) {  // n takes 16 and 24; k takes 1,10,19,28,37; cn_stride is fixed at 11
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_subtile) {  // n takes 16 and 24; k takes 1,10,19,28,37; each (m, n, k) is passed to the tester
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {  // single pass: the upper bound equals mr(1), so m is only ever 1
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, small_kernel) {  // full 1x8 tile with ks(3); k takes 1,10,19,28,37
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, small_kernel_subtile) {  // ks(3) with n swept over 1..8 and k over 1,10,19,28,37
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: the upper bound equals mr(1), so m is only ever 1
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_small_kernel) {  // sweeps n over 9..15 and k over 1,10,19,28,37 with ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(n)  // pass the swept n; a fixed 8 would repeat the n == nr case on every pass
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_small_kernel) {  // n takes 16 and 24; k takes 1,10,19,28,37; ks(3)
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(n)  // pass the swept n; a fixed 8 would never reach the multi-tile widths this test is named for
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm_subtile) {  // cm_stride(11) with n swept over 1..8 and k over 1,10,19,28,37
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 1; m++) {  // single pass: the upper bound equals mr(1), so m is only ever 1
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(4)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, a_offset) {  // full 1x8 tile with ks(3) and a_offset(43); k takes 1,10,19,28,37
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(4)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, zero) {  // same setup as a_offset plus zero_index(mz); k takes 1,10,19,28,37
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 1; mz++) {  // single pass: the bound equals mr(1), so mz is only ever 0
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(4)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmin) {  // single full-tile case (m=1, n=8, k=8) with qmin(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(4)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmax) {  // single full-tile case (m=1, n=8, k=8) with qmax(128)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(4)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm) {  // single full-tile case (m=1, n=8, k=8) with cm_stride(11)
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(4)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -17808,2809 +13596,469 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MULL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(4)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(2)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(4)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(4)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(4)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
+      .nr(16)
+      .kr(2)
+      .sr(4)
       .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
+      .nr(16)
+      .kr(2)
+      .sr(4)
       .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(m)
-        .n(8)
-        .k(8)
+        .n(16)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
+    for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(1)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(1)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(1)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(1)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
-          .n(8)
+          .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
-          .n(8)
+          .n(16)
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
-          .n(8)
+          .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
           .n(n)
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(1)
-        .n(8)
+        .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
-          .n(8)
+          .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
-          .n(8)
+          .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
+            .nr(16)
+            .kr(2)
+            .sr(4)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(11)
+            .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
+        .nr(16)
+        .kr(2)
+        .sr(4)
         .m(1)
-        .n(8)
+        .n(16)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
+          .nr(16)
+          .kr(2)
+          .sr(4)
           .m(1)
-          .n(8)
+          .n(16)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
+      .nr(16)
+      .kr(2)
+      .sr(4)
       .m(1)
-      .n(8)
-      .k(8)
+      .n(16)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
+      .nr(16)
+      .kr(2)
+      .sr(4)
       .m(1)
-      .n(8)
-      .k(8)
+      .n(16)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
+      .nr(16)
+      .kr(2)
+      .sr(4)
       .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -21552,39 +15000,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
@@ -21592,16 +15040,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
@@ -21609,48 +15057,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21658,34 +15106,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21693,34 +15141,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21728,54 +15176,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21783,54 +15231,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21838,35 +15286,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21875,55 +15323,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -21932,89 +15380,557 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
+    for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16) {  // single configuration: m == mr (2), n == nr (8), k = 16
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cn) {  // m == mr, n == nr, k = 16, with an explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)  // 11 is larger than nr = 8
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {  // k = 16 for every (m, n) with 1 <= m <= 2 and 1 <= n <= 8
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)  // iterations set to 1 for each (m, n) pair
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {  // k = 16, n fixed at nr (8), m varied over 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)  // iterations set to 1 for each m
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {  // k = 16, m fixed at mr (2), n varied over 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)  // iterations set to 1 for each n
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16) {  // m == mr, n == nr, every k in 1..15
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {  // every k in 1..15 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iterations set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16) {  // m == mr, n == nr, every k in 17..31
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {  // every k in 17..31 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iterations set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16) {  // m == mr, n == nr, k = 32, 48, ..., 160 (multiples of 16)
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {  // k = 32, 48, ..., 160 crossed with every m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iterations set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8) {  // m == mr, n in 9..15 (above nr = 8), k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable; a fixed n(8) only repeats the n == nr case and never exceeds nr
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {  // m == mr, n in 9..15, k in {1, 18, 35, 52, 69}, explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable; a fixed n(8) only repeats the n == nr case and never exceeds nr
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr = 8
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {  // n in 9..15, k in {1, 18, 35, 52, 69}, m in 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iterations set to 1 for each (n, k, m) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8) {  // m == mr, n in {16, 24} (multiples of nr = 8), k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable; a fixed n(8) never tests n = 16 or n = 24
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {  // m == mr, n in {16, 24}, k in {1, 18, 35, 52, 69}, explicit cn_stride
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)  // 11 is larger than nr = 8
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {  // n in {16, 24}, k in {1, 18, 35, 52, 69}, m in 1..2
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)  // iterations set to 1 for each (n, k, m) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, small_kernel) {  // m == mr, n == nr, k in {1, 18, 35, 52, 69}, with ks set to 3
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, small_kernel_subtile) {  // ks = 3; k in {1, 18, 35, 52, 69} crossed with m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)  // iterations set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {  // ks = 3; m == mr, n in 9..15, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable; a fixed n(8) only repeats the n == nr case and never exceeds nr
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {  // ks = 3; m == mr, n in {16, 24}, k in {1, 18, 35, 52, 69}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop variable; a fixed n(8) never tests n = 16 or n = 24
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {  // explicit cm_stride; k in {1, 18, 35, 52, 69} crossed with m in 1..2 and n in 1..8
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)  // 11 is larger than nr = 8
+            .iterations(1)  // iterations set to 1 for each (k, m, n) combination
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, a_offset) {  // m == mr, n == nr, ks = 3, k in {1, 18, 35, 52, 69}, with a non-zero a_offset
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)  // 163 exceeds mr * 80 = 160, the largest k bound in this loop times mr
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, zero) {  // same settings as the a_offset case, with zero_index set to each mz in {0, 1}
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)  // mz ranges over 0..mr-1
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmin) {  // m == mr, n == nr, k = 16, with qmin overridden
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)  // NOTE(review): presumably the lower output clamp in the tester's own encoding - confirm in gemm-microkernel-tester.h
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmax) {  // m == mr, n == nr, k = 16, with qmax overridden
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)  // NOTE(review): presumably the upper output clamp in the tester's own encoding - confirm in gemm-microkernel-tester.h
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm) {  // m == mr, n == nr, k = 16, with an explicit cm_stride
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)  // 11 is larger than nr = 8
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -22488,1443 +16404,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
@@ -23932,16 +16444,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
@@ -23949,48 +16461,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -23998,34 +16510,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -24033,34 +16545,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -24068,54 +16580,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -24123,54 +16635,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -24178,35 +16690,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -24215,55 +16727,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -24272,89 +16784,89 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -24828,322 +17340,322 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(m)
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(m)
-        .n(8)
+        .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
+    for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(4)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 16 left the n > 16 widths untested
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 16 left the n > 16 widths untested
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 16 never covered n = 32 or n = 48
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
           .n(n)
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
@@ -25151,184 +17663,184 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 16 left the n > 16 widths untested
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 16 never covered n = 32 or n = 48
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
+            .nr(16)
             .kr(4)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(11)
+            .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
+        .nr(16)
         .kr(4)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(16)
           .k(k)
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MULL_LD1R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
@@ -25336,16 +17848,16 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
@@ -25353,48 +17865,48 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25402,34 +17914,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25437,34 +17949,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25472,54 +17984,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 8 left the n > 8 widths untested
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
-          .n(8)
+          .n(n)  // pass the loop's n; a fixed 8 left the n > 8 widths untested
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25527,54 +18039,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25582,35 +18094,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25619,55 +18131,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
-          .n(8)
+          .m(1)
+          .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -25676,89 +18188,89 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(163)
+          .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -26232,1041 +18744,105 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(4)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
+          .nr(16)
           .kr(4)
           .sr(1)
           .m(m)
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
         .m(m)
         .n(16)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27274,34 +18850,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27309,34 +18885,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27344,54 +18920,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27399,54 +18975,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27454,35 +19030,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27491,55 +19067,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -27548,89 +19124,89 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(16)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(16)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_LD2R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(16)
-      .k(8)
+      .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -28572,1878 +20148,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -31380,39 +21084,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
@@ -31420,16 +21124,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
@@ -31437,48 +21141,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31486,34 +21190,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31521,34 +21225,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31556,54 +21260,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31611,54 +21315,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31666,35 +21370,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31703,55 +21407,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(4)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -31760,1025 +21464,89 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(4)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(1)
+        .m(4)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
+    for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(4)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(1)
+          .m(4)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(4)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(1)
+      .m(4)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -33252,39 +22020,507 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(2)
       .sr(1)
       .m(4)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
+      .nr(16)
       .kr(2)
       .sr(1)
       .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16) {  // m == mr, n swept over 17..31, k sampled in steps of 9
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // previously .n(16): the loop variable was ignored and only n == nr was run
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {  // as n_gt_16, with cn_stride set to 19
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // previously .n(16): the loop variable was ignored and only n == nr was run
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16) {  // m == mr, n in {32, 48}, k sampled in steps of 9
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // previously .n(16): the loop variable was ignored; n_div_16_strided_cn already passes n
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_small_kernel) {  // as n_gt_16, with ks set to 3
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // previously .n(16): the loop variable was ignored and only n == nr was run
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_small_kernel) {  // as n_div_16, with ks set to 3
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // previously .n(16): the loop variable was ignored and only n == nr was run
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
@@ -33292,16 +22528,16 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
@@ -33309,48 +22545,48 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(1)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33358,34 +22594,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33393,34 +22629,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33428,54 +22664,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33483,54 +22719,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33538,35 +22774,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33575,55 +22811,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(1)
             .nr(8)
             .kr(2)
             .sr(1)
@@ -33632,410 +22868,410 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(1)
         .nr(8)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(1)
           .nr(8)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(331)
+          .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(1)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(1)
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(1)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(1)
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(1)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(1)
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(2)
-      .n(16)
+      .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(2)
-      .n(16)
+      .n(8)
       .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(m)
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(m)
-        .n(16)
+        .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
+    for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(2)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(2)
-        .n(16)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(2)
-        .n(16)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(2)
-        .n(16)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
-          .n(16)
+          .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
-          .n(16)
+          .n(8)
           .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
-          .n(16)
+          .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
           .n(n)
           .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(2)
-        .n(16)
+        .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
@@ -34043,467 +23279,467 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
-          .n(16)
+          .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
-          .n(16)
+          .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(2)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(19)
+            .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(2)
-        .n(16)
+        .n(8)
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(2)
-          .n(16)
+          .n(8)
           .k(k)
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(2)
-      .n(16)
+      .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(2)
-      .n(16)
+      .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(2)
-      .n(16)
+      .n(8)
       .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(3)
-      .n(16)
+      .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(3)
-      .n(16)
+      .n(8)
       .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(m)
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(m)
-        .n(16)
+        .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
+    for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(3)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(3)
-        .n(16)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(3)
-        .n(16)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(3)
-        .n(16)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
-          .n(16)
+          .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
-          .n(16)
+          .n(n)
           .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
-          .n(16)
+          .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
           .n(n)
           .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(3)
-        .n(16)
+        .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
@@ -34511,712 +23747,244 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
-          .n(16)
+          .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
-          .n(16)
+          .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(3)
-            .nr(16)
+            .nr(8)
             .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(19)
+            .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(3)
-        .nr(16)
+        .nr(8)
         .kr(2)
         .sr(1)
         .m(3)
-        .n(16)
+        .n(8)
         .k(k)
         .ks(3)
         .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
-          .nr(16)
+          .nr(8)
           .kr(2)
           .sr(1)
           .m(3)
-          .n(16)
+          .n(8)
           .k(k)
           .ks(3)
           .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(3)
-      .n(16)
+      .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(3)
-      .n(16)
+      .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(3)
-      .n(16)
+      .n(8)
       .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
       .nr(8)
-      .kr(16)
+      .kr(8)
       .sr(1)
       .m(2)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
       .nr(8)
-      .kr(16)
+      .kr(8)
       .sr(1)
       .m(2)
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(m)
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(m)
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(2)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -35224,34 +23992,34 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -35259,34 +24027,34 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -35294,54 +24062,54 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -35349,54 +24117,54 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -35404,35 +24172,35 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(2)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -35440,56 +24208,56 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -35497,44 +24265,44 @@
           GemmMicrokernelTester()
             .mr(2)
             .nr(8)
-            .kr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
-        .kr(16)
+        .kr(8)
         .sr(1)
         .m(2)
         .n(8)
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
-          .kr(16)
+          .kr(8)
           .sr(1)
           .m(2)
           .n(8)
@@ -35542,51 +24310,519 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
       .nr(8)
-      .kr(16)
+      .kr(8)
       .sr(1)
       .m(2)
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
       .nr(8)
-      .kr(16)
+      .kr(8)
       .sr(1)
       .m(2)
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__AARCH64_NEON_MLAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
       .nr(8)
-      .kr(16)
+      .kr(8)
       .sr(1)
       .m(2)
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop's n so widths above nr(8) are actually exercised
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop's n so the strided-cn path sees widths above nr(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop's n (multiples of nr), matching n_div_8_strided_cn
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop's n so ks(3) is tested with widths above nr(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)  // pass the loop's n so ks(3) is tested with multiples of nr(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 2; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
 
@@ -36060,474 +25296,6 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-
-
-#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
   TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -36996,474 +25764,6 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-
-
-#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
   TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
@@ -37932,474 +26232,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -38868,39 +26700,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
@@ -38908,16 +26740,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
@@ -38925,48 +26757,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -38974,34 +26806,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39009,34 +26841,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39044,54 +26876,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39099,54 +26931,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39154,35 +26986,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39191,55 +27023,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39248,127 +27080,127 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
+    for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(163)
+          .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
@@ -39376,16 +27208,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
@@ -39393,48 +27225,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39442,34 +27274,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39477,34 +27309,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39512,54 +27344,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39567,54 +27399,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39622,35 +27454,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39659,55 +27491,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(8)
             .sr(1)
@@ -39716,127 +27548,127 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(127)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
@@ -39844,16 +27676,16 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
@@ -39861,48 +27693,48 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39910,34 +27742,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39945,34 +27777,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -39980,54 +27812,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -40035,54 +27867,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -40090,35 +27922,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -40127,55 +27959,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -40184,1025 +28016,89 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
+    for (uint32_t mz = 0; mz < 3; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MLAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -41676,2811 +28572,939 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C16__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C16__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
-      .kr(16)
+      .nr(16)
+      .kr(8)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
-      .kr(16)
+      .nr(16)
+      .kr(8)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(m)
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(m)
-        .n(8)
+        .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
+    for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(4)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // sweep n over 17..31 instead of pinning it to nr
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // sweep n over 17..31 instead of pinning it to nr
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // sweep n over 32 and 48 instead of pinning it to nr
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
           .n(n)
           .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
+    for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // sweep n over 17..31 instead of pinning it to nr
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
+    for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(n)  // sweep n over 32 and 48 instead of pinning it to nr
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, strided_cm_subtile) {  // Every m in 1..4 and n in 1..16 with cm_stride(19); k = 1, 18, 35, 52, 69; one iteration per shape.
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
             .mr(4)
-            .nr(8)
-            .kr(16)
+            .nr(16)
+            .kr(8)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(11)
+            .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, a_offset) {  // Full 4x16 tile with ks(3) and a_offset(331); k = 1, 18, 35, 52, 69.
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(4)
-        .nr(8)
-        .kr(16)
+        .nr(16)
+        .kr(8)
         .sr(1)
         .m(4)
-        .n(8)
+        .n(16)
         .k(k)
         .ks(3)
         .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, zero) {  // For each zero_index mz in 0..3: full 4x16 tile, ks(3), a_offset(331); k = 1, 18, 35, 52, 69.
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
-          .nr(8)
-          .kr(16)
+          .nr(16)
+          .kr(8)
           .sr(1)
           .m(4)
-          .n(8)
+          .n(16)
           .k(k)
           .ks(3)
           .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, qmin) {  // Single full-tile run (m=4, n=16, k=16) with qmin(128).
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
-      .kr(16)
+      .nr(16)
+      .kr(8)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, qmax) {  // Single full-tile run (m=4, n=16, k=16) with qmax(128).
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
-      .kr(16)
+      .nr(16)
+      .kr(8)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C16__NEON_MLAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, strided_cm) {  // Single full-tile run (m=4, n=16, k=16) with cm_stride(19).
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(8)
-      .kr(16)
+      .nr(16)
+      .kr(8)
       .sr(1)
       .m(4)
-      .n(8)
+      .n(16)
       .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(16)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(16)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C16__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(16)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
-      .kr(4)
+      .kr(16)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)  // k equals kr(16); single full-tile run with m=3, n=8.
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
-      .kr(4)
+      .kr(16)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
         .m(m)
         .n(8)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t m = 1; m <= 3; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(8)
-            .kr(4)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(16)
         .sr(1)
-        .m(1)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t mz = 0; mz < 1; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(16)
           .sr(1)
-          .m(1)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
-      .kr(4)
+      .kr(16)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
-      .kr(4)
+      .kr(16)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(8)
-      .kr(4)
+      .kr(16)
       .sr(1)
-      .m(1)
+      .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
-#endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
@@ -45887,474 +30911,6 @@
 #endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 8; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 8; m++) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 8; m++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 8; m++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 8; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(8)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(8)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(8)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t mz = 0; mz < 8; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(8)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(8)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_8X16C4__NEONDOT, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(8)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(8)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
@@ -46824,474 +31380,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -47760,39 +31848,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(6)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(6)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(6)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(6)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
@@ -47800,16 +31888,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
@@ -47817,48 +31905,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(6)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(6)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 6; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -47866,34 +31954,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(6)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 6; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -47901,34 +31989,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(6)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 6; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -47936,54 +32024,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 6; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -47991,54 +32079,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 6; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -48046,35 +32134,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(6)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 6; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -48083,55 +32171,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 6; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(6)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -48140,1025 +32228,89 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(6)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(6)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
+    for (uint32_t mz = 0; mz < 6; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(6)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(6)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(6)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(6)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(6)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(6)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(6)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(6)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE_PRFM, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -49632,39 +32784,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 2; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
@@ -49672,16 +32824,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
@@ -49689,48 +32841,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -49738,34 +32890,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -49773,34 +32925,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -49808,54 +32960,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(16)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -49863,54 +33015,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(16)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -49918,35 +33070,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -49955,55 +33107,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(16)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 2; m++) {
         for (uint32_t n = 1; n <= 16; n++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -50012,89 +33164,89 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 1; mz++) {
+    for (uint32_t mz = 0; mz < 2; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -50568,39 +33720,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
@@ -50608,16 +33760,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
@@ -50625,48 +33777,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50674,34 +33826,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50709,34 +33861,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50744,54 +33896,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50799,54 +33951,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50854,35 +34006,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50891,55 +34043,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -50948,557 +34100,89 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 2; mz++) {
+    for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -51971,484 +34655,16 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        for (uint32_t n = 1; n <= 16; n++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(1)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(1)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MULL_ADDW_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(1)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     GemmMicrokernelTester()
       .mr(4)
       .nr(8)
-      .kr(1)
+      .kr(4)
       .sr(1)
       .m(4)
       .n(8)
@@ -52457,15 +34673,15 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     GemmMicrokernelTester()
       .mr(4)
       .nr(8)
-      .kr(1)
+      .kr(4)
       .sr(1)
       .m(4)
       .n(8)
@@ -52475,17 +34691,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(m)
           .n(n)
@@ -52497,16 +34713,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(m)
         .n(8)
@@ -52517,16 +34733,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(4)
         .n(n)
@@ -52537,16 +34753,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(4)
         .n(8)
@@ -52556,18 +34772,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52580,16 +34796,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(4)
         .n(8)
@@ -52599,18 +34815,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52623,16 +34839,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(4)
         .n(8)
@@ -52642,18 +34858,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52666,17 +34882,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(8)
@@ -52687,17 +34903,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(8)
@@ -52709,18 +34925,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52733,17 +34949,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(8)
@@ -52754,17 +34970,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(n)
@@ -52776,18 +34992,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52800,16 +35016,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(4)
         .n(8)
@@ -52820,18 +35036,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52845,17 +35061,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(8)
@@ -52867,17 +35083,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(8)
@@ -52889,18 +35105,18 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
             .mr(4)
             .nr(8)
-            .kr(1)
+            .kr(4)
             .sr(1)
             .m(m)
             .n(n)
@@ -52914,16 +35130,16 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
-        .kr(1)
+        .kr(4)
         .sr(1)
         .m(4)
         .n(8)
@@ -52935,17 +35151,17 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, zero) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
-          .kr(1)
+          .kr(4)
           .sr(1)
           .m(4)
           .n(8)
@@ -52959,15 +35175,15 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     GemmMicrokernelTester()
       .mr(4)
       .nr(8)
-      .kr(1)
+      .kr(4)
       .sr(1)
       .m(4)
       .n(8)
@@ -52977,15 +35193,15 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     GemmMicrokernelTester()
       .mr(4)
       .nr(8)
-      .kr(1)
+      .kr(4)
       .sr(1)
       .m(4)
       .n(8)
@@ -52995,15 +35211,15 @@
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
 
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     struct xnn_code_buffer code_buffer;
     ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(&code_buffer));
+    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64(&code_buffer));
     GemmMicrokernelTester()
       .mr(4)
       .nr(8)
-      .kr(1)
+      .kr(4)
       .sr(1)
       .m(4)
       .n(8)
@@ -53012,580 +35228,4 @@
       .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
   }
-#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
-
-
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t m = 1; m <= 4; m++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .cn_stride(11)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        for (uint32_t n = 1; n <= 8; n++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    for (uint32_t mz = 0; mz < 4; mz++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    struct xnn_code_buffer code_buffer;
-    ASSERT_EQ(xnn_status_success, xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE));
-    ASSERT_EQ(xnn_status_success, xnn_generate_qs8_igemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(&code_buffer));
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(reinterpret_cast<xnn_qs8_igemm_minmax_ukernel_function>(code_buffer.code), xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    ASSERT_EQ(xnn_status_success, xnn_release_code_memory(&code_buffer));
-  }
-#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
+#endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT