Split qs8-igemm-minmax-rndnu tests into 1 more file (4 total), seeing compile timeouts in coverage runs

PiperOrigin-RevId: 426186255
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index e7305dc..cb1e58e 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -491,8 +491,8 @@
 #endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY
 
 
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -502,10 +502,10 @@
       .m(4)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -516,10 +516,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -532,12 +532,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -549,11 +549,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -565,11 +565,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -580,11 +580,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -598,13 +598,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -615,11 +615,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -633,13 +633,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -650,11 +650,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -668,13 +668,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -686,12 +686,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -704,12 +704,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -723,13 +723,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -741,12 +741,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -759,12 +759,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -778,13 +778,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -796,11 +796,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -815,13 +815,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -834,12 +834,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -852,12 +852,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -872,13 +872,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -891,11 +891,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t mz = 0; mz < 4; mz++) {
@@ -910,12 +910,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -926,10 +926,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -940,10 +940,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -954,4693 +954,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
-#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-
-
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD4R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5650,10 +970,10 @@
       .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -5664,10 +984,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -5680,12 +1000,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -5697,11 +1017,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -5713,11 +1033,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -5728,11 +1048,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5746,13 +1066,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -5763,11 +1083,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5781,13 +1101,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -5798,11 +1118,11 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5816,13 +1136,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5834,12 +1154,12 @@
           .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5852,12 +1172,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5871,13 +1191,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5889,12 +1209,12 @@
           .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5907,12 +1227,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5926,13 +1246,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -5944,11 +1264,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -5963,13 +1283,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -5982,12 +1302,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -6000,12 +1320,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6020,13 +1340,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -6039,11 +1359,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t mz = 0; mz < 2; mz++) {
@@ -6058,12 +1378,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6074,10 +1394,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6088,10 +1408,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD4R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MULL_LD1R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -6102,7 +1422,1411 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_LD1R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD2R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_LD2R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -7980,6 +4704,1410 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2__NEON_MLAL_LD4R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_LD4R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 3; mz++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(251)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2__NEON_MLAL_LD4R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MLAL_LD4R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -8448,474 +6576,6 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4S2__NEON_MULL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -9384,322 +7044,322 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
+      .mr(4)
+      .nr(8)
       .kr(4)
       .sr(2)
-      .m(1)
-      .n(16)
+      .m(4)
+      .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
+      .mr(4)
+      .nr(8)
       .kr(4)
       .sr(2)
-      .m(1)
-      .n(16)
+      .m(4)
+      .n(8)
       .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
           .m(m)
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
         .m(m)
-        .n(16)
+        .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
+    for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
-        .m(1)
+        .m(4)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
-        .m(1)
-        .n(16)
+        .m(4)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
-        .m(1)
-        .n(16)
+        .m(4)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
-        .m(1)
-        .n(16)
+        .m(4)
+        .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
-        .m(1)
-        .n(16)
+        .m(4)
+        .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
@@ -9707,146 +7367,146 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
+    for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
+    for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
+          .m(4)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
+            .mr(4)
+            .nr(8)
             .kr(4)
             .sr(2)
             .m(m)
             .n(n)
             .k(k)
-            .cm_stride(19)
+            .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
+        .mr(4)
+        .nr(8)
         .kr(4)
         .sr(2)
-        .m(1)
-        .n(16)
+        .m(4)
+        .n(8)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
+      for (uint32_t mz = 0; mz < 4; mz++) {
         GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
+          .mr(4)
+          .nr(8)
           .kr(4)
           .sr(2)
-          .m(1)
-          .n(16)
+          .m(4)
+          .n(8)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
+      .mr(4)
+      .nr(8)
       .kr(4)
       .sr(2)
-      .m(1)
-      .n(16)
+      .m(4)
+      .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
+      .mr(4)
+      .nr(8)
       .kr(4)
       .sr(2)
-      .m(1)
-      .n(16)
+      .m(4)
+      .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MULL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
+      .mr(4)
+      .nr(8)
       .kr(4)
       .sr(2)
-      .m(1)
-      .n(16)
+      .m(4)
+      .n(8)
       .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -10788,1879 +8448,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C4S2__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(1)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(1)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4S2__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(1)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(2)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(2)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(2)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4S2__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(2)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(1)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(1)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(4)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(4)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(4)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(43)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(1)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2S4__NEON_MULL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(4)
-      .m(1)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12669,11 +8457,11 @@
       .sr(4)
       .m(3)
       .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -12682,12 +8470,12 @@
       .sr(4)
       .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 3; m++) {
@@ -12698,14 +8486,14 @@
           .sr(4)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
@@ -12715,13 +8503,13 @@
         .sr(4)
         .m(m)
         .n(8)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12731,15 +8519,15 @@
         .sr(4)
         .m(3)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(3)
         .nr(8)
@@ -12748,13 +8536,13 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
@@ -12766,15 +8554,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(3)
         .nr(8)
@@ -12783,13 +8571,13 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
@@ -12801,15 +8589,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(3)
         .nr(8)
@@ -12818,13 +8606,13 @@
         .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
@@ -12836,16 +8624,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
@@ -12854,15 +8642,15 @@
           .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
@@ -12872,15 +8660,15 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
             .mr(3)
@@ -12891,16 +8679,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
@@ -12909,15 +8697,15 @@
           .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
@@ -12927,15 +8715,15 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
             .mr(3)
@@ -12946,15 +8734,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(3)
         .nr(8)
@@ -12964,13 +8752,13 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
@@ -12983,16 +8771,16 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
@@ -13002,15 +8790,15 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
@@ -13020,14 +8808,14 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
@@ -13040,15 +8828,15 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(3)
         .nr(8)
@@ -13058,14 +8846,14 @@
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t mz = 0; mz < 3; mz++) {
         GemmMicrokernelTester()
           .mr(3)
@@ -13076,14 +8864,14 @@
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(127)
+          .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13092,12 +8880,12 @@
       .sr(4)
       .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13106,12 +8894,12 @@
       .sr(4)
       .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
@@ -13120,15 +8908,15 @@
       .sr(4)
       .m(3)
       .n(8)
-      .k(8)
+      .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13137,11 +8925,11 @@
       .sr(4)
       .m(4)
       .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13150,12 +8938,12 @@
       .sr(4)
       .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13166,14 +8954,14 @@
           .sr(4)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -13183,13 +8971,13 @@
         .sr(4)
         .m(m)
         .n(8)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13199,15 +8987,15 @@
         .sr(4)
         .m(4)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -13216,13 +9004,13 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
@@ -13234,15 +9022,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -13251,13 +9039,13 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
@@ -13269,15 +9057,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -13286,13 +9074,13 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
@@ -13304,16 +9092,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -13322,15 +9110,15 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -13340,15 +9128,15 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
@@ -13359,16 +9147,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -13377,15 +9165,15 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -13395,15 +9183,15 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
@@ -13414,15 +9202,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -13432,13 +9220,13 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
@@ -13451,16 +9239,16 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -13470,15 +9258,15 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -13488,14 +9276,14 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
@@ -13508,15 +9296,15 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -13526,14 +9314,14 @@
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t mz = 0; mz < 4; mz++) {
         GemmMicrokernelTester()
           .mr(4)
@@ -13544,14 +9332,14 @@
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(163)
+          .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13560,12 +9348,12 @@
       .sr(4)
       .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13574,12 +9362,12 @@
       .sr(4)
       .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -13588,15 +9376,15 @@
       .sr(4)
       .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13605,11 +9393,11 @@
       .sr(4)
       .m(2)
       .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -13618,12 +9406,12 @@
       .sr(4)
       .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -13634,14 +9422,14 @@
           .sr(4)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -13651,13 +9439,13 @@
         .sr(4)
         .m(m)
         .n(16)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -13667,15 +9455,15 @@
         .sr(4)
         .m(2)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(16)
@@ -13684,13 +9472,13 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -13702,15 +9490,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(16)
@@ -13719,13 +9507,13 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -13737,15 +9525,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(16)
@@ -13754,13 +9542,13 @@
         .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -13772,16 +9560,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(16)
@@ -13790,15 +9578,15 @@
           .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(16)
@@ -13808,15 +9596,15 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
             .mr(2)
@@ -13827,16 +9615,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(16)
@@ -13845,15 +9633,15 @@
           .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(16)
@@ -13863,15 +9651,15 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
             .mr(2)
@@ -13882,15 +9670,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(16)
@@ -13900,13 +9688,13 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -13919,16 +9707,16 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(16)
@@ -13938,15 +9726,15 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(16)
@@ -13956,14 +9744,14 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -13976,15 +9764,15 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(16)
@@ -13994,14 +9782,14 @@
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t mz = 0; mz < 2; mz++) {
         GemmMicrokernelTester()
           .mr(2)
@@ -14012,14 +9800,14 @@
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14028,12 +9816,12 @@
       .sr(4)
       .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14042,12 +9830,12 @@
       .sr(4)
       .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -14056,47 +9844,47 @@
       .sr(4)
       .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(2)
       .sr(4)
-      .m(1)
+      .m(3)
       .n(16)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(2)
       .sr(4)
-      .m(1)
+      .m(3)
       .n(16)
       .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
@@ -14104,16 +9892,16 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
@@ -14121,48 +9909,48 @@
         .n(16)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
-        .m(1)
+        .m(3)
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14170,34 +9958,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14205,34 +9993,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14240,54 +10028,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14295,54 +10083,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14350,35 +10138,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14387,55 +10175,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(3)
             .nr(16)
             .kr(2)
             .sr(4)
@@ -14444,127 +10232,127 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(3)
         .nr(16)
         .kr(2)
         .sr(4)
-        .m(1)
+        .m(3)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(251)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
+      for (uint32_t mz = 0; mz < 3; mz++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(3)
           .nr(16)
           .kr(2)
           .sr(4)
-          .m(1)
+          .m(3)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(251)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(2)
       .sr(4)
-      .m(1)
+      .m(3)
       .n(16)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(2)
       .sr(4)
-      .m(1)
+      .m(3)
       .n(16)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C2S4__NEON_MLAL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C2S4__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(3)
       .nr(16)
       .kr(2)
       .sr(4)
-      .m(1)
+      .m(3)
       .n(16)
       .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
@@ -14572,16 +10360,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
@@ -14589,48 +10377,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14638,34 +10426,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14673,34 +10461,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14708,54 +10496,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14763,54 +10551,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14818,35 +10606,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14855,55 +10643,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(8)
             .kr(4)
             .sr(1)
@@ -14912,89 +10700,89 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(8)
         .kr(4)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(8)
           .kr(4)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(4)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -15468,39 +11256,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
@@ -15508,16 +11296,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
@@ -15525,48 +11313,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15574,34 +11362,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15609,34 +11397,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15644,54 +11432,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15699,54 +11487,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15754,35 +11542,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15791,55 +11579,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(16)
             .kr(4)
             .sr(1)
@@ -15848,557 +11636,89 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(16)
         .kr(4)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(16)
           .kr(4)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(127)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C4__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(16)
       .kr(4)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -16872,7 +12192,1411 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(16)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(16)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(16)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MLAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MULL_LD1R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 2; m++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 2; m++) {
+          GemmMicrokernelTester()
+            .mr(2)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(2)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(2)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
+        GemmMicrokernelTester()
+          .mr(2)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(2)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C4__NEON_MLAL_LD1R, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16882,10 +13606,10 @@
       .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -16896,10 +13620,10 @@
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -16912,12 +13636,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -16929,11 +13653,11 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -16945,11 +13669,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -16960,11 +13684,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -16978,13 +13702,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -16995,11 +13719,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17013,13 +13737,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -17030,11 +13754,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17048,13 +13772,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17066,12 +13790,12 @@
           .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17084,12 +13808,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17103,13 +13827,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17121,12 +13845,12 @@
           .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17139,12 +13863,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17158,13 +13882,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -17176,11 +13900,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17195,13 +13919,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17214,12 +13938,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -17232,12 +13956,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -17252,13 +13976,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -17271,11 +13995,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t mz = 0; mz < 1; mz++) {
@@ -17290,12 +14014,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17306,10 +14030,10 @@
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17320,10 +14044,10 @@
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD1R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MULL_LD2R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -17334,949 +14058,13 @@
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(127)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MULL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18286,10 +14074,10 @@
       .m(1)
       .n(8)
       .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18300,10 +14088,10 @@
       .n(8)
       .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -18316,12 +14104,12 @@
           .n(n)
           .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -18333,11 +14121,11 @@
         .n(8)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -18349,11 +14137,11 @@
         .n(n)
         .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
@@ -18364,11 +14152,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18382,13 +14170,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
@@ -18399,11 +14187,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18417,13 +14205,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
@@ -18434,11 +14222,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18452,13 +14240,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18470,12 +14258,12 @@
           .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18488,12 +14276,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18507,13 +14295,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18525,12 +14313,12 @@
           .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18543,12 +14331,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18562,13 +14350,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -18580,11 +14368,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18599,13 +14387,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18618,12 +14406,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 80; k += 17) {
@@ -18636,12 +14424,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -18656,13 +14444,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
@@ -18675,11 +14463,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t mz = 0; mz < 1; mz++) {
@@ -18694,12 +14482,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18710,10 +14498,10 @@
       .n(8)
       .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18724,10 +14512,10 @@
       .n(8)
       .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD1R, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEON_MLAL_LD2R, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -18738,3283 +14526,943 @@
       .n(8)
       .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_eq_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(8)
-      .kr(4)
+      .kr(2)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(1)
       .nr(8)
-      .kr(4)
+      .kr(2)
       .sr(1)
-      .m(3)
+      .m(1)
       .n(8)
-      .k(16)
+      .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
           .m(m)
           .n(n)
-          .k(16)
+          .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(8)
-        .kr(4)
+        .kr(2)
         .sr(1)
         .m(m)
         .n(8)
-        .k(16)
+        .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(8)
-        .kr(4)
+        .kr(2)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(n)
-        .k(16)
+        .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_lt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
+    for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(8)
-        .kr(4)
+        .kr(2)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
+    for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
+    for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(8)
-        .kr(4)
+        .kr(2)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
+    for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
+    for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(8)
-        .kr(4)
+        .kr(2)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, k_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
+    for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
+      for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
+      for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
+      for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
+      for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
+    for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(1)
         .nr(8)
-        .kr(4)
+        .kr(2)
         .sr(1)
-        .m(3)
+        .m(1)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
+    for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
+      for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
+      for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(1)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
-          .m(3)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
+    for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(1)
             .nr(8)
-            .kr(4)
+            .kr(2)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(3)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
         .mr(3)
         .nr(8)
-        .kr(4)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
         .sr(1)
         .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(3)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t mz = 0; mz < 3; mz++) {
         GemmMicrokernelTester()
           .mr(3)
           .nr(8)
-          .kr(4)
+          .kr(2)
           .sr(1)
           .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C4__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD1R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(2)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(2)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C4__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(2)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
           .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MULL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C4__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEON_MLAL_LD2R, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MULL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -22488,105 +15936,573 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
-      .nr(16)
+      .nr(8)
       .kr(2)
       .sr(1)
       .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
+    for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 4; m++) {
         GemmMicrokernelTester()
           .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(16)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 32; k <= 160; k += 16) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(2)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(2)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 4; mz++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(331)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C2__NEON_MLAL_DUP, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(2)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(16)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(16)
+      .kr(2)
+      .sr(1)
+      .m(2)
+      .n(16)
+      .k(16)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 2; m++) {
+        GemmMicrokernelTester()
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
         .m(m)
         .n(16)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(2)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22594,34 +16510,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22629,34 +16545,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(2)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22664,54 +16580,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22719,54 +16635,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22774,35 +16690,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(2)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22811,55 +16727,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(2)
             .nr(16)
             .kr(2)
             .sr(1)
@@ -22868,2433 +16784,93 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(2)
         .nr(16)
         .kr(2)
         .sr(1)
-        .m(4)
+        .m(2)
         .n(16)
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(2)
           .nr(16)
           .kr(2)
           .sr(1)
-          .m(4)
+          .m(2)
           .n(16)
           .k(k)
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(2)
       .nr(16)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(2)
       .nr(16)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C2__NEON_MULL_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16C2__NEON_MLAL_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(2)
       .nr(16)
       .kr(2)
       .sr(1)
-      .m(4)
+      .m(2)
       .n(16)
-      .k(8)
+      .k(16)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
-          GemmMicrokernelTester()
-            .mr(1)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(1)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(1)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
-        GemmMicrokernelTester()
-          .mr(1)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(1)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(83)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(1)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(1)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C2__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(2)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(2)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(2)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C2__NEON_MLAL_DUP, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(2)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-
-
-#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
-          GemmMicrokernelTester()
-            .mr(2)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(2)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(2)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
-        GemmMicrokernelTester()
-          .mr(2)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(2)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__AARCH64_NEON_MLAL_PRFM_CORTEX_A53, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(2)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-
-
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__AARCH64_NEON_MLAL_CORTEX_A53, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
@@ -25764,7 +17340,7 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25774,10 +17350,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -25788,10 +17364,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -25804,12 +17380,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -25821,11 +17397,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -25837,11 +17413,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -25852,11 +17428,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -25870,13 +17446,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -25887,11 +17463,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -25905,13 +17481,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -25922,11 +17498,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -25940,13 +17516,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25958,12 +17534,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25976,12 +17552,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -25995,13 +17571,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26013,12 +17589,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26031,12 +17607,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26050,13 +17626,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -26068,11 +17644,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -26087,13 +17663,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26106,12 +17682,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -26124,12 +17700,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -26144,13 +17720,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -26163,11 +17739,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t mz = 0; mz < 4; mz++) {
@@ -26182,12 +17758,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26198,10 +17774,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26212,10 +17788,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -26226,7 +17802,475 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 4; mz++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
 
@@ -26700,34 +18744,970 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(8)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(43)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C8__NEON_MULL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      for (uint32_t m = 1; m <= 3; m++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 3; m++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        for (uint32_t m = 1; m <= 3; m++) {
+          GemmMicrokernelTester()
+            .mr(3)
+            .nr(16)
+            .kr(8)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(3)
+        .nr(16)
+        .kr(8)
+        .sr(1)
+        .m(3)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 3; mz++) {
+        GemmMicrokernelTester()
+          .mr(3)
+          .nr(16)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(127)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MULL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(3)
+      .nr(16)
+      .kr(8)
+      .sr(1)
+      .m(3)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(2)
+      .nr(8)
+      .kr(8)
+      .sr(1)
+      .m(2)
+      .n(8)
+      .k(16)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       for (uint32_t m = 1; m <= 2; m++) {
@@ -26738,14 +19718,14 @@
           .sr(1)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
@@ -26755,13 +19735,13 @@
         .sr(1)
         .m(m)
         .n(8)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -26771,15 +19751,15 @@
         .sr(1)
         .m(2)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
@@ -26788,13 +19768,13 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -26806,15 +19786,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
@@ -26823,13 +19803,13 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -26841,15 +19821,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
@@ -26858,13 +19838,13 @@
         .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -26876,16 +19856,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
@@ -26894,15 +19874,15 @@
           .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
@@ -26912,15 +19892,15 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
             .mr(2)
@@ -26931,16 +19911,16 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
@@ -26949,15 +19929,15 @@
           .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
@@ -26967,15 +19947,15 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
             .mr(2)
@@ -26986,15 +19966,15 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
@@ -27004,13 +19984,13 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -27023,16 +20003,16 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
@@ -27042,15 +20022,15 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(2)
           .nr(8)
@@ -27060,14 +20040,14 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
         for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
@@ -27080,15 +20060,15 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(2)
         .nr(8)
@@ -27098,14 +20078,14 @@
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t mz = 0; mz < 2; mz++) {
         GemmMicrokernelTester()
           .mr(2)
@@ -27116,14 +20096,14 @@
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27132,12 +20112,12 @@
       .sr(1)
       .m(2)
       .n(8)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27146,12 +20126,12 @@
       .sr(1)
       .m(2)
       .n(8)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(2)
@@ -27160,113 +20140,113 @@
       .sr(1)
       .m(2)
       .n(8)
-      .k(8)
+      .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(4)
       .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
+      for (uint32_t m = 1; m <= 4; m++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
+    for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
         .m(m)
         .n(8)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(4)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27274,34 +20254,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27309,34 +20289,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27344,54 +20324,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27399,54 +20379,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 3; m++) {
+      for (size_t k = 1; k <= 80; k += 17) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27454,35 +20434,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(4)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27491,55 +20471,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
+        for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
-            .mr(3)
+            .mr(4)
             .nr(8)
             .kr(8)
             .sr(1)
@@ -27548,2431 +20528,1027 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
-        .mr(3)
+        .mr(4)
         .nr(8)
         .kr(8)
         .sr(1)
-        .m(3)
+        .m(4)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(127)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(331)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
+    for (size_t k = 1; k <= 80; k += 17) {
+      for (uint32_t mz = 0; mz < 4; mz++) {
         GemmMicrokernelTester()
-          .mr(3)
+          .mr(4)
           .nr(8)
           .kr(8)
           .sr(1)
-          .m(3)
+          .m(4)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(127)
+          .a_offset(331)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MULL, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8C8__NEON_MLAL, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(3)
+      .mr(4)
       .nr(8)
       .kr(8)
       .sr(1)
-      .m(3)
+      .m(4)
       .n(8)
-      .k(8)
+      .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
       .nr(16)
-      .kr(8)
+      .kr(16)
       .sr(1)
       .m(1)
       .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .k(16)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
       .nr(16)
-      .kr(8)
+      .kr(16)
       .sr(1)
       .m(1)
       .n(16)
-      .k(8)
+      .k(16)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       for (uint32_t m = 1; m <= 1; m++) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(m)
           .n(n)
-          .k(8)
+          .k(16)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(m)
         .n(16)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_eq_16_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(1)
         .n(n)
-        .k(8)
+        .k(16)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_lt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_lt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
+    for (size_t k = 1; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_gt_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
+    for (size_t k = 17; k < 32; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_div_16) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, k_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
+    for (size_t k = 32; k <= 160; k += 16) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
+      for (size_t k = 1; k <= 80; k += 17) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t n = 1; n <= 16; n++) {
         for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
             .mr(1)
             .nr(16)
-            .kr(8)
+            .kr(16)
             .sr(1)
             .m(m)
             .n(n)
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, a_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       GemmMicrokernelTester()
         .mr(1)
         .nr(16)
-        .kr(8)
+        .kr(16)
         .sr(1)
         .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, zero) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
+    for (size_t k = 1; k <= 80; k += 17) {
       for (uint32_t mz = 0; mz < 1; mz++) {
         GemmMicrokernelTester()
           .mr(1)
           .nr(16)
-          .kr(8)
+          .kr(16)
           .sr(1)
           .m(1)
           .n(16)
           .k(k)
           .ks(3)
+          .a_offset(83)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C16__NEON_MLAL, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(16)
+      .kr(16)
+      .sr(1)
+      .m(1)
+      .n(16)
+      .k(16)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    GemmMicrokernelTester()
+      .mr(1)
+      .nr(8)
+      .kr(4)
+      .sr(1)
+      .m(1)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 1; m++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t m = 1; m <= 1; m++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 1; m++) {
+          GemmMicrokernelTester()
+            .mr(1)
+            .nr(8)
+            .kr(4)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, a_offset) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(1)
+        .nr(8)
+        .kr(4)
+        .sr(1)
+        .m(1)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, zero) {
+    TEST_REQUIRES_ARM_NEON_DOT;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
+        GemmMicrokernelTester()
+          .mr(1)
+          .nr(8)
+          .kr(4)
+          .sr(1)
+          .m(1)
+          .n(8)
+          .k(k)
+          .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, qmin) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(16)
-      .kr(8)
+      .nr(8)
+      .kr(4)
       .sr(1)
       .m(1)
-      .n(16)
+      .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, qmax) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(16)
-      .kr(8)
+      .nr(8)
+      .kr(4)
       .sr(1)
       .m(1)
-      .n(16)
+      .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16C8__NEON_MULL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
       .mr(1)
-      .nr(16)
-      .kr(8)
+      .nr(8)
+      .kr(4)
       .sr(1)
       .m(1)
-      .n(16)
+      .n(8)
       .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C8__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(3)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(3)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16C8__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(3)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(8)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(8)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(331)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(8)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(331)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C8__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(8)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(16)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 3; m++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(16)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 3; m++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_eq_16_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(3)
-        .n(n)
-        .k(16)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_lt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_gt_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 17; k < 32; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_div_16) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, k_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 32; k <= 160; k += 16) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 80; k += 17) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 3; m++) {
-          GemmMicrokernelTester()
-            .mr(3)
-            .nr(8)
-            .kr(16)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      GemmMicrokernelTester()
-        .mr(3)
-        .nr(8)
-        .kr(16)
-        .sr(1)
-        .m(3)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 80; k += 17) {
-      for (uint32_t mz = 0; mz < 3; mz++) {
-        GemmMicrokernelTester()
-          .mr(3)
-          .nr(8)
-          .kr(16)
-          .sr(1)
-          .m(3)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8C16__NEON_MLAL, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(3)
-      .nr(8)
-      .kr(16)
-      .sr(1)
-      .m(3)
-      .n(8)
-      .k(16)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
@@ -30911,474 +22487,6 @@
 #endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(16)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 16; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 17; n < 32; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 32; n <= 48; n += 16) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(16)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(19)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(16)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(16)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(16)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(16)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__NEONDOT, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(16)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(16)
-      .k(8)
-      .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
-
-
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
@@ -32316,39 +23424,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(6)
+      .mr(3)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(6)
+      .m(3)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(6)
+      .mr(3)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(6)
+      .m(3)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 6; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
@@ -32356,16 +23464,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 6; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
@@ -32373,48 +23481,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(6)
+        .m(3)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(6)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32422,34 +23530,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(6)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32457,34 +23565,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(6)
+        .m(3)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32492,54 +23600,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32547,54 +23655,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32602,35 +23710,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(6)
+        .m(3)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32639,55 +23747,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(6)
+            .mr(3)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -32696,595 +23804,127 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(6)
+        .mr(3)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(6)
+        .m(3)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 6; mz++) {
+      for (uint32_t mz = 0; mz < 3; mz++) {
         GemmMicrokernelTester()
-          .mr(6)
+          .mr(3)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(6)
+          .m(3)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(251)
+          .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(6)
+      .mr(3)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(6)
+      .m(3)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(6)
+      .mr(3)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(6)
+      .m(3)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(6)
+      .mr(3)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(6)
+      .m(3)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(6)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(6)
-      .n(8)
-      .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(6)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(6)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 6; m++) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 6; m++) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(6)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(6)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(6)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(6)
-        .n(8)
-        .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_div_8) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(n)
-          .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(6)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 6; m++) {
-          GemmMicrokernelTester()
-            .mr(6)
-            .nr(8)
-            .kr(1)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, a_offset) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(6)
-        .nr(8)
-        .kr(1)
-        .sr(1)
-        .m(6)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(251)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, zero) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 6; mz++) {
-        GemmMicrokernelTester()
-          .mr(6)
-          .nr(8)
-          .kr(1)
-          .sr(1)
-          .m(6)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(251)
-          .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(6)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(6)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(6)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(6)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(QS8_IGEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE_PRFM, strided_cm) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(6)
-      .nr(8)
-      .kr(1)
-      .sr(1)
-      .m(6)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON;
-    GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
-      for (uint32_t m = 1; m <= 2; m++) {
+      for (uint32_t m = 1; m <= 1; m++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
@@ -33292,16 +23932,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 2; m++) {
+    for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
@@ -33309,48 +23949,48 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33358,34 +23998,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33393,34 +24033,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33428,54 +24068,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33483,54 +24123,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33538,35 +24178,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33575,55 +24215,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_gt_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, n_div_16_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 16; n++) {
-        for (uint32_t m = 1; m <= 2; m++) {
+        for (uint32_t m = 1; m <= 1; m++) {
           GemmMicrokernelTester()
-            .mr(2)
+            .mr(1)
             .nr(16)
             .kr(1)
             .sr(1)
@@ -33632,89 +24272,89 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(2)
+        .mr(1)
         .nr(16)
         .kr(1)
         .sr(1)
-        .m(2)
+        .m(1)
         .n(16)
         .k(k)
         .ks(3)
-        .a_offset(83)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(43)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 2; mz++) {
+      for (uint32_t mz = 0; mz < 1; mz++) {
         GemmMicrokernelTester()
-          .mr(2)
+          .mr(1)
           .nr(16)
           .kr(1)
           .sr(1)
-          .m(2)
+          .m(1)
           .n(16)
           .k(k)
           .ks(3)
-          .a_offset(83)
+          .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE_PRFM, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE_PRFM, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(2)
+      .mr(1)
       .nr(16)
       .kr(1)
       .sr(1)
-      .m(2)
+      .m(1)
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
@@ -34188,39 +24828,39 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 1; m++) {
+      for (uint32_t m = 1; m <= 2; m++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
@@ -34228,16 +24868,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 1; m++) {
+    for (uint32_t m = 1; m <= 2; m++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
@@ -34245,48 +24885,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34294,34 +24934,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34329,34 +24969,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34364,54 +25004,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34419,54 +25059,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34474,35 +25114,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34511,55 +25151,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 1; m++) {
+        for (uint32_t m = 1; m <= 2; m++) {
           GemmMicrokernelTester()
-            .mr(1)
+            .mr(2)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34568,127 +25208,127 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(1)
+        .mr(2)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(1)
+        .m(2)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(43)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(83)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 1; mz++) {
+      for (uint32_t mz = 0; mz < 2; mz++) {
         GemmMicrokernelTester()
-          .mr(1)
+          .mr(2)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(1)
+          .m(2)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(43)
+          .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_1X8__NEON_MULL_ADDW_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_2X8__NEON_MULL_ADDW_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(1)
+      .mr(2)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(1)
+      .m(2)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_eq_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .cn_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t m = 1; m <= 3; m++) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
@@ -34696,16 +25336,16 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
-    for (uint32_t m = 1; m <= 4; m++) {
+    for (uint32_t m = 1; m <= 3; m++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
@@ -34713,48 +25353,48 @@
         .n(8)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_lt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34762,34 +25402,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34797,34 +25437,34 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34832,54 +25472,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_gt_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34887,54 +25527,54 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_div_8) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34942,35 +25582,35 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -34979,55 +25619,55 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 40; k += 9) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t m = 1; m <= 3; m++) {
           GemmMicrokernelTester()
-            .mr(4)
+            .mr(3)
             .nr(8)
             .kr(1)
             .sr(1)
@@ -35036,556 +25676,88 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
         }
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, a_offset) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
-        .mr(4)
+        .mr(3)
         .nr(8)
         .kr(1)
         .sr(1)
-        .m(4)
+        .m(3)
         .n(8)
         .k(k)
         .ks(3)
-        .a_offset(163)
-        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+        .a_offset(127)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, zero) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
+      for (uint32_t mz = 0; mz < 3; mz++) {
         GemmMicrokernelTester()
-          .mr(4)
+          .mr(3)
           .nr(8)
           .kr(1)
           .sr(1)
-          .m(4)
+          .m(3)
           .n(8)
           .k(k)
           .ks(3)
-          .a_offset(163)
+          .a_offset(127)
           .zero_index(mz)
-          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
       }
     }
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, qmin) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .qmin(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, qmax) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .qmax(128)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 
-  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__NEON_MULL_ADDW_DUP, strided_cm) {
+  TEST(QS8_IGEMM_MINMAX_RNDNU_3X8__NEON_MULL_ADDW_DUP, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
-      .mr(4)
+      .mr(3)
       .nr(8)
       .kr(1)
       .sr(1)
-      .m(4)
+      .m(3)
       .n(8)
       .k(8)
       .cm_stride(11)
-      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cn_stride(11)
-      .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 8; n++) {
-      for (uint32_t m = 1; m <= 4; m++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(m)
-          .n(n)
-          .k(8)
-          .iterations(1)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_m) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t m = 1; m <= 4; m++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(m)
-        .n(8)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_eq_8_subtile_n) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 1; n <= 8; n++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(n)
-        .k(8)
-        .iterations(1)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k < 8; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 9; k < 16; k++) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, k_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 16; k <= 80; k += 8) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_strided_cn) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .cn_stride(11)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, small_kernel_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .ks(3)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_gt_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, n_div_8_small_kernel) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 40; k += 9) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(n)
-          .k(k)
-          .ks(3)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, strided_cm_subtile) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t n = 1; n <= 8; n++) {
-        for (uint32_t m = 1; m <= 4; m++) {
-          GemmMicrokernelTester()
-            .mr(4)
-            .nr(8)
-            .kr(4)
-            .sr(1)
-            .m(m)
-            .n(n)
-            .k(k)
-            .cm_stride(11)
-            .iterations(1)
-            .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-        }
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, a_offset) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      GemmMicrokernelTester()
-        .mr(4)
-        .nr(8)
-        .kr(4)
-        .sr(1)
-        .m(4)
-        .n(8)
-        .k(k)
-        .ks(3)
-        .a_offset(163)
-        .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, zero) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    for (size_t k = 1; k <= 40; k += 9) {
-      for (uint32_t mz = 0; mz < 4; mz++) {
-        GemmMicrokernelTester()
-          .mr(4)
-          .nr(8)
-          .kr(4)
-          .sr(1)
-          .m(4)
-          .n(8)
-          .k(k)
-          .ks(3)
-          .a_offset(163)
-          .zero_index(mz)
-          .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, qmin) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmin(128)
-      .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, qmax) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .qmax(128)
-      .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-
-  TEST(GENERATE_QS8_IGEMM_RNDNU_4X8C4__AARCH32_NEONDOT_LD64, strided_cm) {
-    TEST_REQUIRES_ARM_NEON_DOT;
-    GemmMicrokernelTester()
-      .mr(4)
-      .nr(8)
-      .kr(4)
-      .sr(1)
-      .m(4)
-      .n(8)
-      .k(8)
-      .cm_stride(11)
-      .Test(xnn_generate_qs8_igemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu);
-  }
-#endif  // XNN_ARCH_ARM && !XNN_PLATFORM_IOS && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT